Source code for gatelfpytorchjson.modelwrapperdefault

from . modelwrapper import ModelWrapper
from . embeddingsmodule import EmbeddingsModule
from . ngrammodule import NgramModule
import os
import torch
import torch.nn
import torch.optim
from torch.autograd import Variable as V
from .classificationmodule import ClassificationModule
from .takefromtuple import TakeFromTuple
import logging
import sys
import statistics
import pickle
from gatelfdata import Dataset
import numpy as np
import pkgutil
import timeit
import signal


# Basic usage:
# ds = Dataset(metafile)
# wrapper = ModelWrapperDefault(ds) # or some other subclass
# wrapper.train()
# # get some data for application somewhere
# instances = get_them()
# predictions = wrapper.apply(instances)
# NOTE: maybe use the same naming conventions as scikit-learn here!!
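#
# A slightly more complete sketch of the intended workflow, based on the methods
# defined in this module (file names and the instance data are hypothetical
# placeholders, not part of this module):
#
#   from gatelfdata import Dataset
#   from gatelfpytorchjson.modelwrapperdefault import ModelWrapperDefault
#
#   ds = Dataset("corpus.meta.json")            # hypothetical metafile path
#   wrapper = ModelWrapperDefault(ds)
#   wrapper.prepare_data(validationsize=0.1)    # hold out 10% for validation
#   wrapper.train(max_epochs=20, batch_size=20, filenameprefix="mymodel")
#   wrapper.save("mymodel")
#   # later / elsewhere: apply to new instances in original (unconverted) format
#   labels, scores, distributions = wrapper.apply(instances)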

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
streamhandler = logging.StreamHandler(stream=sys.stderr)
formatter = logging.Formatter(
                '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
streamhandler.setFormatter(formatter)
logger.addHandler(streamhandler)


def f(value):
    """Format a float value to have 3 digits after the decimal point"""
    return "{0:.3f}".format(value)
class ModelWrapperDefault(ModelWrapper):
    def init_from_dataset(self):
        """Set the convenience attributes which we get from the dataset instance"""
        dataset = self.dataset
        self.metafile = dataset.metafile
        self.float_idxs = dataset.get_float_feature_idxs()
        self.index_idxs = dataset.get_index_feature_idxs()
        self.indexlist_idxs = dataset.get_indexlist_feature_idxs()
        self.float_feats = dataset.get_float_features()
        self.index_feats = dataset.get_index_features()
        self.indexlist_feats = dataset.get_indexlist_features()
        self.featureinfo = {"num_idxs": self.float_idxs,
                            "nom_idxs": self.index_idxs,
                            "ngr_idxs": self.indexlist_idxs}
        self.info = dataset.get_info()
    # This requires an initialized dataset instance
    def __init__(self, dataset, config={}, cuda=None):
        """This requires a gatelfdata Dataset instance and can optionally take a dictionary with
        configuration/initialization options (NOT SUPPORTED YET).
        If cuda is None, then cuda will be used if it is available. True and False
        require and prohibit the use of cuda unconditionally.
        Config settings: stopfile: a file path, if found training is stopped
        """
        super().__init__(dataset, config=config)
        self.config = config
        logger.debug("Init with config=%s" % (config,))
        if "cuda" in config and config["cuda"] is not None:
            cuda = config["cuda"]
        self.cuda = cuda
        self.checkpointnr = 0
        self.stopfile = os.path.join(os.path.dirname(dataset.metafile), "STOP")
        if "stopfile" in config and config["stopfile"] is not None:
            self.stopfile = config["stopfile"]
        self.stopfile = os.path.abspath(self.stopfile)
        logging.getLogger(__name__).debug("Set the stop file to %s" % self.stopfile)
        self.override_learningrate = None
        if "learningrate" in config and config["learningrate"]:
            self.override_learningrate = config["learningrate"]
        cuda_is_available = torch.cuda.is_available()
        if self.cuda is None:
            enable_cuda = cuda_is_available
        else:
            enable_cuda = self.cuda
        self._enable_cuda = enable_cuda  # this tells us if we should actually set cuda or not
        logger.debug("Init cuda=%s enable_cuda=%s" % (cuda, self._enable_cuda,))
        self.dataset = dataset
        self.init_from_dataset()
        # various configuration settings which can be set before passing on control to the
        # task-specific initialization
        self.best_model_saved = False
        self.validate_every_batches = None
        self.validate_every_epochs = 1
        self.validate_every_instances = None
        self.report_every_batches = None
        self.report_every_instances = 500
        self.is_data_prepared = False
        self.valset = None  # Validation set created by prepare_data
        self.lossfunction = None
        self.module = None  # the init_<TASK> method actually sets this!!
        self.random_seed = 0
        # if the config requires that a specific module gets used, create it here, otherwise
        # create the module needed for sequences or non-sequences
        # IMPORTANT! the optimizer needs to get created after the module has been moved to a GPU
        # using cuda()!!!
        if "module" in config and config["module"] is not None:
            logger.debug("Init, modules importable: %s" %
                         ([x[1] for x in pkgutil.iter_modules(path=".gatelfpytorchjson")],))
            # TODO: figure out how to do this right!!
            ptclassname = config["module"]
            logger.debug("Init import, trying to use class/file: %s" % (ptclassname,))
            import importlib
            # NOTE: the following worked and seemed to be required on one computer ...
            # parent = importlib.import_module(".."+ptclassname, package=".gatelfpytorchjson.modules."+ptclassname)
            # this works fine:
            parent = importlib.import_module("gatelfpytorchjson.modules."+ptclassname)
            class_ = getattr(parent, ptclassname)
            self.module = class_(dataset, config=config)
            # TODO: best method to configure the loss for the module? For now we expect a static method
            # in the class that returns it
            self.lossfunction = self.module.get_lossfunction(config=config)
            if self._enable_cuda:
                self.module.cuda()
                self.lossfunction.cuda()
            self.optimizer = self.module.get_optimizer(config=config)
        else:
            if self.info["isSequence"]:
                self.init_sequencetagging(dataset)
            else:
                if self.info["targetType"] == "nominal":
                    self.init_classification(dataset)
                else:
                    raise Exception("Target type not yet implemented: %s" % self.info["targetType"])
            if self._enable_cuda:
                self.module.cuda()
                self.lossfunction.cuda()
            # get the parameters for the optimizer, but make sure we do not include parameters for fixed layers!
            params = filter(lambda p: p.requires_grad, self.module.parameters())
            # self.optimizer = torch.optim.SGD(self.module.parameters(), lr=0.001, momentum=0.9)
            # self.optimizer = torch.optim.SGD(self.module.parameters(), lr=(self.override_learningrate or 0.001))
            # self.optimizer = torch.optim.Adadelta(params, lr=1.0, rho=0.9, eps=1e-06, weight_decay=0)
            # self.optimizer = torch.optim.Adagrad(params, lr=0.01, lr_decay=0, weight_decay=0, initial_accumulator_value=0)
            self.optimizer = torch.optim.Adam(params,
                                              lr=(self.override_learningrate or 0.001),
                                              betas=(0.9, 0.999),
                                              eps=1e-08,
                                              weight_decay=0)
            # self.optimizer = torch.optim.Adamax(params, lr=0.002, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
            # self.optimizer = torch.optim.ASGD(params, lr=0.01, lambd=0.0001, alpha=0.75, t0=1000000.0, weight_decay=0)
            # self.optimizer = torch.optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
            # self.optimizer = torch.optim.Rprop(params, lr=0.01, etas=(0.5, 1.2), step_sizes=(1e-06, 50))
            # self.optimizer = torch.optim.SGD(params, lr=0.1, momentum=0, dampening=0, weight_decay=0, nesterov=False)
            # NOTE/TODO: check out how to implement a learning rate scheduler that makes the LR depend e.g. on epoch, see
            # http://pytorch.org/docs/master/optim.html
            # e.g. every 10 epochs, make lr half of what it was:
            # self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=0.5)
            # self.optimizer = torch.optim.SGD(params, lr=0.1, momentum=0.0)
        self.interrupted = False
        signal.signal(signal.SIGINT, self._signal_handler)

    # This is mainly used at application time, for training, the same thing happens in init.
    # TODO: this should get moved into a common superclass for all modelwrappers!
    def set_cuda(self, flag):
        """Advise to use CUDA if flag is True, or CPU if False. True is ignored if cuda is not available"""
        if flag and torch.cuda.is_available():
            self.module.cuda()
            self.lossfunction.cuda()
            self._enable_cuda = True
        else:
            self.module.cpu()
            self.lossfunction.cpu()
            self._enable_cuda = False
    def _signal_handler(self, sig, frame):
        logger.info("Received interrupt signal, setting interrupt flag")
        self.interrupted = True
    def init_classification(self, dataset):
        n_classes = self.info["nClasses"]
        inputlayers = []
        # keep track of the number of input layer output dimensions
        inlayers_outdims = 0
        # if we have numeric features, create the numeric input layer
        if len(self.float_idxs) > 0:
            n_in = len(self.float_idxs)
            n_hidden = ModelWrapper.makeless(n_in, p1=0.5)
            lin = torch.nn.Linear(n_in, n_hidden)
            act = torch.nn.ELU()
            layer = torch.nn.Sequential(lin, act)
            inlayers_outdims += n_hidden
            lname = "input_numeric"
            inputlayers.append((layer, {"type": "numeric", "name": lname}))
            pass
        # if we have nominal features, create all the layers for those
        # TODO: may need to handle onehot features differently!!
        # remember which layers we already have for an embedding id
        nom_layers = {}
        for i in range(len(self.index_feats)):
            nom_feat = self.index_feats[i]
            nom_idx = self.index_idxs[i]
            vocab = nom_feat.vocab
            emb_id = vocab.emb_id
            if emb_id in nom_layers:
                emblayer = nom_layers.get(emb_id)
            else:
                emblayer = EmbeddingsModule(vocab)
                nom_layers[emb_id] = emblayer
            lname = "input_emb_%s_%s" % (i, emb_id)
            inputlayers.append((emblayer, {"type": "nominal", "name": lname}))
            inlayers_outdims += emblayer.emb_dims
        for i in range(len(self.indexlist_feats)):
            ngr_feat = self.indexlist_feats[i]
            nom_idx = self.indexlist_idxs[i]
            vocab = ngr_feat.vocab
            emb_id = vocab.emb_id
            if emb_id in nom_layers:
                emblayer = nom_layers.get(emb_id)
            else:
                emblayer = EmbeddingsModule(vocab)
                nom_layers[emb_id] = emblayer
            lname = "input_ngram_%s_%s" % (i, emb_id)
            ngram_layer = self.config.get("ngram_layer")
            if ngram_layer is None:
                ngram_layer = "cnn"
            ngramlayer = NgramModule(emblayer, method=ngram_layer)  # lstm or cnn
            inputlayers.append((ngramlayer, {"type": "ngram", "name": lname}))
            inlayers_outdims += ngramlayer.out_dim
        # Now create the hidden layers
        hiddenlayers = []
        # THIS WAS THE OLD APPROACH, using TWO linear layers, separated by ELU:
        # for now, one hidden layer for compression and another
        # to map to the number of classes
        # n_hidden1lin_out = ModelWrapper.makeless(inlayers_outdims)
        # hidden1lin = torch.nn.Linear(inlayers_outdims, n_hidden1lin_out)
        # hidden1act = torch.nn.ELU()
        # hidden2 = torch.nn.Linear(n_hidden1lin_out, n_classes)
        # hidden = torch.nn.Sequential(hidden1lin, hidden1act, hidden2)
        # INSTEAD we just use a single linear layer, no nonlinearity
        hidden = torch.nn.Linear(inlayers_outdims, n_classes)
        hiddenlayers.append((hidden, {"name": "hidden"}))
        # Create the output layer
        out = torch.nn.LogSoftmax(dim=1)
        outputlayer = (out, {"name": "output"})
        # create the module and store it
        self.module = ClassificationModule(inputlayers,
                                           hiddenlayers,
                                           outputlayer,
                                           self.featureinfo)
        # Decide on the loss function here for training later!
        self.lossfunction = torch.nn.NLLLoss(ignore_index=-1)
    def init_sequencetagging(self, dataset):
        """Build the module for sequence tagging."""
        # NOTE: For sequence tagging, the shape of our input is slightly different:
        # - the indep is a list of features, as before
        # - but for each feature, there is a (padded) list of values
        # - each dep is also a padded list of values
        # In theory we could combine the features before going into the LSTM, or
        # we have different LSTMs for each feature and combine afterwards.
        # Here we combine before, so the output of e.g. a Linear layer is not just
        # a vector, but a matrix where one dimension is the batch, one dimension is the sequence
        # and one dimension is the value(vector). If we have batch size b, max sequence length s
        # and value dimension d, we should get shape b,s,d if batch_first is True, otherwise s,b,d
        n_classes = self.info["nClasses"]
        inputlayers = []
        # keep track of the number of input layer output dimensions
        inlayers_outdims = 0
        # if we have numeric features, create the numeric input layer
        if len(self.float_idxs) > 0:
            n_in = len(self.float_idxs)
            n_hidden = ModelWrapper.makeless(n_in, p1=0.5)
            lin = torch.nn.Linear(n_in, n_hidden)
            act = torch.nn.ELU()
            layer = torch.nn.Sequential(lin, act)
            inlayers_outdims += n_hidden
            lname = "input_numeric"
            inputlayers.append((layer, {"type": "numeric", "name": lname}))
            pass
        # if we have nominal features, create all the layers for those
        # TODO: may need to handle onehot features differently!!
        # remember which layers we already have for an embedding id
        nom_layers = {}
        for i in range(len(self.index_feats)):
            nom_feat = self.index_feats[i]
            nom_idx = self.index_idxs[i]
            vocab = nom_feat.vocab
            emb_id = vocab.emb_id
            if emb_id in nom_layers:
                emblayer = nom_layers.get(emb_id)
            else:
                emblayer = EmbeddingsModule(vocab)
                nom_layers[emb_id] = emblayer
            lname = "input_emb_%s_%s" % (i, emb_id)
            inputlayers.append((emblayer, {"type": "nominal", "name": lname}))
            inlayers_outdims += emblayer.emb_dims
        for i in range(len(self.indexlist_feats)):
            ngr_feat = self.indexlist_feats[i]
            nom_idx = self.indexlist_idxs[i]
            vocab = ngr_feat.vocab
            emb_id = vocab.emb_id
            if emb_id in nom_layers:
                emblayer = nom_layers.get(emb_id)
            else:
                emblayer = EmbeddingsModule(vocab)
                nom_layers[emb_id] = emblayer
            lname = "input_ngram_%s_%s" % (i, emb_id)
            ngramlayer = NgramModule(emblayer)
            inputlayers.append((ngramlayer, {"type": "ngram", "name": lname}))
            inlayers_outdims += ngramlayer.out_dim
        # Now create the hidden layers
        hiddenlayers = []
        # TODO: originally we always had this layer between the inputs and the LSTM, but
        # it may be better to just use a NOOP instead and just use the concatenated inputs.
        if False:
            n_hidden1lin_out = ModelWrapper.makeless(inlayers_outdims)
            hidden1lin = torch.nn.Linear(inlayers_outdims, n_hidden1lin_out)
            hidden1act = torch.nn.ELU()
            hidden1layer = torch.nn.Sequential(hidden1lin, hidden1act)
        else:
            n_hidden1lin_out = inlayers_outdims
            hidden1layer = None
        # for now, the size of the hidden layer is identical to the input size, up to
        # a maximum of 200
        lstm_hidden_size = min(200, n_hidden1lin_out)
        lstm_bidirectional = False
        # Now that we have combined the features, we create the lstm
        hidden2 = torch.nn.LSTM(input_size=n_hidden1lin_out,
                                hidden_size=lstm_hidden_size,
                                num_layers=1,
                                # dropout=0.1,
                                bidirectional=lstm_bidirectional,
                                batch_first=True)
        # the outputs of the LSTM are of shape b, seq, hidden
        # We want to get softmax outputs for each, so we need to get this to
        # b, seq, nclasses
        # NOTE: we cannot use Sequential here since the LSTM returns a tuple and
        # Sequential does not properly deal with this. So instead of adding the LSTM directly
        # we wrap it in a tiny custom wrapper that just returns the first element of the
        # tuple in the forward step
        hidden2 = TakeFromTuple(hidden2, which=0)
        # NOTE: if the LSTM is bidirectional, we need to double the size
        hidden3_size = lstm_hidden_size
        if lstm_bidirectional:
            hidden3_size *= 2
        hidden3 = torch.nn.Linear(hidden3_size, n_classes)
        if not hidden1layer:
            hidden = torch.nn.Sequential(hidden2, hidden3)
        else:
            hidden = torch.nn.Sequential(hidden1layer, hidden2, hidden3)
        hiddenlayers.append((hidden, {"name": "hidden"}))
        # Create the output layer
        out = torch.nn.LogSoftmax(dim=2)
        outputlayer = (out, {"name": "output"})
        # create the module and store it
        self.module = ClassificationModule(inputlayers,
                                           hiddenlayers,
                                           outputlayer,
                                           self.featureinfo)
        # For sequence tagging we cannot use CrossEntropyLoss
        self.lossfunction = torch.nn.NLLLoss(ignore_index=-1)
    def get_module(self):
        """Return the PyTorch module that has been built and is used by this wrapper."""
        return self.module
    def prepare_data(self, validationsize=None, file=None):
        """If file is not None, use the content of the file and ignore the size.
        If validationsize is > 1, it is the absolute size, if < 1 it is the portion (e.g. 0.01) to use."""
        # get the validation set
        if self.is_data_prepared:
            logger.warning("Called prepare_data after it was already called, doing nothing")
            return
        if file is not None:
            # use the file for validation
            self.dataset.split(convert=True, validation_file=file)
        else:
            if validationsize is not None:
                validationsize = float(validationsize)
            valsize = None
            valpart = None
            # TODO: allow not using a validation set at all!
            if validationsize is not None:
                if validationsize > 1 or validationsize == 0:
                    valsize = validationsize
                else:
                    valpart = validationsize
            else:
                valpart = 0.1
            self.dataset.split(convert=True, validation_part=valpart, validation_size=valsize)
        self.valset = self.dataset.validation_set_converted(as_batch=True)
        self.is_data_prepared = True
    # TODO: if we have a validation set, calculate the class distribution here.
    # This should be shown before training starts so the validation accuracy makes more sense.
    # This can also be used to use a loss function that re-weights classes in case of class imbalance!
    # deps = self.valset[1]
    # TODO: calculate the class distribution, but if sequences, ONLY for the non-padded parts of the sequences!!!!
    # TODO: this needs to use masking to undo the padding in the results!
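    # Examples for the validationsize convention documented in prepare_data above
    # (the values and the file name are illustrative only; the file must be in a
    # format understood by gatelfdata's Dataset.split):
    #   wrapper.prepare_data(validationsize=0.1)   # hold out 10% of the training data
    #   wrapper.prepare_data(validationsize=500)   # hold out exactly 500 instances
    #   wrapper.prepare_data(file="validation.json")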
    def apply(self, instancelist, converted=False, reshaped=False):
        """Given a list of instances in original format (or converted if converted=True), applies
        the model to them in evaluation mode and returns the following:
        As the first return value, the batch of predictions. This is a list of values (1 value
        for each instance in the batch) for classification and a list of lists (1 list representing
        a sequence for each instance in the batch) for sequence tagging.
        As the second value, returns the score/s for the returned predictions. This has the same
        shape as the first return value, but contains a score instead of each label.
        As the third value, returns a batch of confidence/scoring values. For classification, this
        is a list of lists, where the inner list is the label distribution. For sequence tagging,
        this is a list of list of lists, again with the label distribution as the inner-most list.
        Note that the mapping between the index of a value in the label distribution and the label
        itself can be figured out by the caller by retrieving the target vocab first.
        This may return additional data in the future or the format of what is returned may change.
        """
        oldlevel = logger.level
        # logger.setLevel(logging.DEBUG)
        batchsize = len(instancelist)
        if not converted:
            # TODO: check if and when to do instance normalization here!
            instancelist = [self.dataset.convert_indep(x) for x in instancelist]
            logger.debug("apply: instances after conversion: %s" % (instancelist,))
        if not reshaped:
            instancelist = self.dataset.reshape_batch(instancelist, indep_only=True)
            logger.debug("apply: instances after reshaping: %s" % (instancelist,))
        preds = self._apply_model(instancelist, train_mode=False)
        logger.debug("apply: predictions result (shape %s): %s" % (preds.size(), preds,))
        # for now we only have classification (sequence/non-sequence) so
        # for this, we first use the torch max to find the most likely label index,
        # then convert back to the label itself. We also convert the torch probability vector
        # into a simple list of values
        ret = []
        nrClasses = self.dataset.nClasses
        if self.dataset.isSequence:
            # TODO: create a mask and return actual length sequences, not paddings from the tensor!
            # (not relevant in cases where the batchsize is only 1)
            # TODO: make it work for batchsize > 1!!!!!
            dims = preds.size()[-1]
            reshaped = preds.view(-1, dims).detach()
            logger.debug("apply, reshaped=%s" % (reshaped,))
            reshaped = torch.exp(reshaped)
            logger.debug("apply, reshaped-exp=%s" % (reshaped,))
            _, out_idxs = torch.max(reshaped, 1)
            # NOTE/IMPORTANT: we convert all numpy to list since numpy values (even just floats)
            # cannot get JSON serialized
            reshaped = reshaped.tolist()
            # predictions = out_idxs.cpu().numpy().tolist()
            predictions = out_idxs.tolist()
            probdists = [list(x) for x in reshaped]
            logger.debug("apply, probdists=%s" % (probdists,))
            logger.debug("apply, predictions=%s" % (predictions,))
            logger.debug("apply, predictions type=%s" % (type(predictions),))
            # create the list of corresponding labels
            # TODO: again, this is a shortcut that only works if the batch has only one sequence
            logger.debug("len(predictions) %s" % (len(predictions),))
            # for i in range(len(predictions)):
            #     logger.debug("probdists[%s] %s" % (i, probdists[i],))
            #     logger.debug("predictions[%s] %s" % (i, predictions[i],))
            #     logger.debug("probdists[%s][predictions[%s]] %s" % (i, i, probdists[predictions[i]],))
            probs = [probdists[i][predictions[i]] for i in range(len(predictions))]
            labels = [self.dataset.target.idx2label(x) for x in predictions]
            logger.debug("apply, labels=%s" % (labels,))
            logger.debug("apply, probdists=%s" % (probdists,))
            logger.debug("apply, probs=%s" % (probs,))
            logger.setLevel(oldlevel)
            # NOTE: currently the above code only works for a single instance and the
            # variables labels, probs, probdists are all for a single instance, not the batch.
            # So in order to make the result a batch, enclose each in a list as its single element
            return [labels], [probs], [probdists]
        else:
            # preds should be a 2d tensor of size batchsize x numberClasses
            assert len(preds.size()) == 2
            assert preds.size()[0] == batchsize
            assert preds.size()[1] == nrClasses
            probs, out_idxs = torch.max(preds, dim=1)
            probs = probs.detach().cpu().tolist()
            # out_idxs contains the class indices, need to convert back to labels
            getlabel = self.dataset.target.idx2label
            labels = [getlabel(x) for x in out_idxs]
            # for each instance in the batch return a list
            # probs = [list(x) for x in preds]
            probdists = preds.detach().cpu().tolist()
            logger.setLevel(oldlevel)
            ret = labels, probs, probdists
        return ret
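    # Shape of the values returned by apply(), sketched for illustration (label
    # names are made up; note that in the classification branch the scores and
    # distributions come straight from the LogSoftmax output, while the
    # sequence-tagging branch exponentiates them first):
    #   classification, batch of 2:
    #     labels    == ["pos", "neg"]
    #     scores    == [s1, s2]                    # one score per instance
    #     probdists == [[d11, d12], [d21, d22]]    # one label distribution per instance
    #   sequence tagging (currently only reliable for a batch of one sequence):
    #     labels    == [["B-PER", "O", "O"]]
    #     scores    == [[s1, s2, s3]]              # one score per token
    #     probdists == [[[...], [...], [...]]]     # one label distribution per token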
    def _apply_model(self, indeps, train_mode=False):
        """Apply the model to the list of indeps in the correct format for our Pytorch module
        and return a list of predictions as Pytorch variables.
        train_mode influences if the underlying model is used in training mode or not.
        """
        if train_mode and not self.is_data_prepared:
            raise Exception("Must call train or prepare_data first")
        curmodeistrain = self.module.training
        if train_mode and not curmodeistrain:
            self.module.train()
            self.module.zero_grad()
        elif not train_mode and curmodeistrain:
            self.module.eval()
        output = self.module(indeps)
        # logger.debug("Output of model is of size %s: %s" % (output.size(), output, ))
        # restore the training/eval mode the module was in before this call
        if self.module.training != curmodeistrain:
            self.module.train(curmodeistrain)
        return output
    def evaluate(self, validationinstances, train_mode=False, as_pytorch=True):
        """Apply the model to the independent part of the validationset instances and use the
        dependent part to evaluate the predictions.
        The validationinstances must be in batch format.
        Returns a tuple of loss, accuracy, number of correct predictions and total number of
        predictions. By default the loss is returned as a pyTorch variable and the accuracy as
        a pytorch tensor; if as_pytorch is set to False, floats are returned instead.
        If prepared=True then validationinstances already contains everything as properly prepared PyTorch Variables.
        """
        if not self.is_data_prepared:
            raise Exception("Must call train or prepare_data first")
        # NOTE!!! the targets are what we get minus 1, which shifts the padding index to be -1
        # TODO: IF we use padded targets, we need to subtract 1 here, otherwise we have to leave this
        # as is!!
        targets = np.array(validationinstances[1])
        # v_deps = V(torch.LongTensor(targets), requires_grad=False)
        v_deps = torch.LongTensor(targets)
        if self._enable_cuda:
            v_deps = v_deps.cuda()
        v_preds = self._apply_model(validationinstances[0], train_mode=train_mode)
        logger.debug("Got v_preds of size %s: %s" % (v_preds.size(), v_preds,))
        logger.debug("Evaluating against targets of size %s: %s" % (v_deps.size(), v_deps))
        # TODO: not sure if and when to zero the grads for the loss function if we use it
        # in between training steps?
        # NOTE: the v_preds may or may not be sequences, if sequences we get the wrong shape here
        # so for now we simply put all the items (sequences and batch items) in the first dimension
        valuedim = v_preds.size()[-1]
        # ORIG: loss = self.lossfunction(v_preds.view(-1, valuedim), v_deps.view(-1))
        # TODO: the reduction should be configurable!
        loss_function = torch.nn.NLLLoss(ignore_index=-1, reduction='elementwise_mean')
        v_preds_reshape = v_preds.view(-1, valuedim)
        # !!DEBUG print("Predictions, reshaped, size=", v_preds_reshape.size(), "is", v_preds_reshape, file=sys.stderr)
        v_deps_reshape = v_deps.view(-1)
        # !!DEBUG print("Targets, reshaped, size=", v_deps_reshape.size(), "is", v_deps_reshape, file=sys.stderr)
        loss = loss_function(v_preds_reshape, v_deps_reshape)
        # calculate the accuracy as well, since we know we have a classification problem
        acc, correct, total = ModelWrapper.accuracy(v_preds, v_deps)
        logger.debug("got loss %s accuracy %s" % (loss, acc, ))
        # print("loss=", loss, "preds=", v_preds, "targets=", v_deps, file=sys.stderr)
        # !!DEBUG sys.exit()
        if not as_pytorch:
            loss = float(loss)
            acc = float(acc)
        return tuple((loss, acc, correct, total))
    # the implementation should figure out best values if a parameter is set to None
    # Also, by default, the method should decide which format
    # to use for reading the data (original or converted)
    def train(self, max_epochs=20, batch_size=20, early_stopping=True, filenameprefix=None):
        """Train the model on the dataset. max_epochs is the maximum number of epochs to train,
        but if early_stopping is enabled, it could be fewer.
        If early_stopping is True, then a default strategy is used where training stops after
        the validation accuracy did not improve for 2 epochs. If set to a function, that function
        (which must accept a standard set of parameters and return a boolean) is used instead.
        TODO: check if config should be used by default for the batch_size etc here!
        """
        # if this gets set to True we bail out of all loops, save the model if necessary and stop training
        stop_it_already = False
        # this gets set by the signal handler and has the same effect as stop_it_already
        self.interrupted = False
        if early_stopping:
            if not filenameprefix:
                raise Exception("If early stopping is specified, filenameprefix is needed")
        if isinstance(early_stopping, bool):
            if early_stopping:
                early_stopping_function = ModelWrapper.early_stopping_checker
            else:
                # no early stopping requested: use a checker that never triggers
                early_stopping_function = lambda *args, **kwargs: False
        else:
            early_stopping_function = early_stopping
        if not self.is_data_prepared:
            logger.warning("Invoked train without calling prepare_data first, running default")
            self.prepare_data()
        # make sure we are in training mode
        self.module.train(mode=True)
        # set the random seed, every module must know how to handle this
        self.module.set_seed(self.random_seed)
        # the list of all validation losses so far
        validation_losses = []
        # list of all validation accuracies so far
        validation_accs = []
        # total number of batches processed over all epochs
        totalbatches = 0
        # for calculating loss and acc over a number of batches or instances for reporting
        report_correct = 0
        report_total = 0
        report_loss = 0
        # best validation accuracy so far
        # initialize the last epoch number for validation to 1 so we do not validate right away
        last_epoch = 1
        best_acc = 0.0
        saved_model_name = None
        for epoch in range(1, max_epochs+1):
            # batch number within an epoch
            batch_nr = 0
            # number of instances already used for training during this epoch
            nr_instances = 0
            # for calculating loss and acc over the whole epoch / training set
            epoch_correct = 0
            epoch_total = 0
            epoch_loss = 0
            for batch in self.dataset.batches_converted(train=True, batch_size=batch_size):
                batch_nr += 1
                totalbatches += 1
                nr_instances += batch_size  # we should use the actual batch size which could be less
                self.module.zero_grad()
                # import ipdb
                # ipdb.set_trace()
                (loss, acc, correct, total) = self.evaluate(batch, train_mode=True)
                logger.debug("Epoch=%s, batch=%s: loss=%s acc=%s" %
                             (epoch, batch_nr, f(loss), f(acc)))
                loss.backward()
                report_loss += float(loss)
                report_correct += float(correct)
                report_total += float(total)
                epoch_loss += float(loss)
                epoch_correct += float(correct)
                epoch_total += float(total)
                self.optimizer.step()
                # evaluation on the training set only for reporting
                if (self.report_every_batches and ((totalbatches % self.report_every_batches) == 0)) or \
                        (self.report_every_instances and ((nr_instances % self.report_every_instances) == 0)):
                    logger.info("Epoch=%s, batch=%s, insts=%s: loss=%s acc=%s / epoch_loss=%s epoch_acc=%s" %
                                (epoch, batch_nr, nr_instances, f(report_loss),
                                 f(report_correct / report_total), f(epoch_loss),
                                 f(epoch_correct / epoch_total)))
                    report_loss = 0
                    report_correct = 0
                    report_total = 0
                # this is for validating against the validation set and possibly early stopping
                if (self.validate_every_batches and ((totalbatches % self.validate_every_batches) == 0)) or \
                        (self.validate_every_epochs and ((epoch - last_epoch) == self.validate_every_epochs)) or \
                        (self.validate_every_instances and ((nr_instances % self.validate_every_instances) == 0)):
                    # evaluate on validation set
                    last_epoch = epoch
                    (loss_val, acc_val, correct, total) = self.evaluate(self.valset, train_mode=False)
                    logger.info("Epoch=%s, VALIDATION: loss=%s acc=%s" %
                                (epoch, f(loss_val), f(acc_val)))
                    validation_losses.append(float(loss_val))
                    validation_accs.append(float(acc_val))
                    # if we have early stopping, check if we should stop
                    if early_stopping:
                        stop_it_already = early_stopping_function(losses=validation_losses, accs=validation_accs)
                        if stop_it_already:
                            logger.info("Early stopping criterion reached, stopping training, best validation acc: %s" %
                                        (best_acc,))
                    # if the current validation accuracy is better than what we had so far, save the model
                    if acc_val > best_acc:
                        best_acc = acc_val
                        saved_model_name = self.save_model(filenameprefix)
                        self.best_model_saved = True
                if self.stopfile and os.path.exists(self.stopfile):
                    logger.info("Stop file found, removing and terminating training, best validation acc: %s" %
                                (best_acc,))
                    os.remove(self.stopfile)
                    stop_it_already = True
                if stop_it_already or self.interrupted:
                    break
            if stop_it_already or self.interrupted:
                self.interrupted = False
                break
        logger.info("Training completed, best validation acc={}, model saved to {}".format(best_acc, saved_model_name))
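    # A custom early-stopping function can be passed to train() instead of True/False.
    # Based on how early_stopping_function is invoked above, it must accept the keyword
    # arguments "losses" and "accs" (the lists of validation losses/accuracies so far)
    # and return True when training should stop. A minimal sketch (the patience value
    # of 2 is just an example):
    #
    #   def stop_after_no_improvement(losses=None, accs=None, **kwargs):
    #       patience = 2
    #       if accs is None or len(accs) <= patience:
    #           return False
    #       best_earlier = max(accs[:-patience])
    #       return all(a <= best_earlier for a in accs[-patience:])
    #
    #   wrapper.train(max_epochs=50, filenameprefix="mymodel",
    #                 early_stopping=stop_after_no_improvement)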
    def checkpoint(self, filenameprefix, checkpointnr=None):
        """Save the module, adding a checkpoint number in the name."""
        # TODO: eventually this should get moved into the module?
        cp = checkpointnr
        if cp is None:
            cp = self.checkpointnr
            self.checkpointnr += 1
        torch.save(self.module, filenameprefix + ".module.pytorch")
    def save_model(self, filenameprefix):
        start = timeit.default_timer()
        filename = filenameprefix + ".module.pytorch"
        torch.save(self.module, filename)
        end = timeit.default_timer()
        logger.info("Saved model to %s in %s" % (filename, f(abs(end - start))))
        return filename
    def save(self, filenameprefix):
        # store everything using pickle, but we do not store the module or the dataset:
        # the dataset will simply get recreated when loading, but the module needs to get saved
        # separately
        # only save the module here if we did not already save the best model during training for some reason
        if not self.best_model_saved:
            self.save_model(filenameprefix)
        assert hasattr(self, 'metafile')
        filename = filenameprefix + ".wrapper.pickle"
        with open(filename, "wb") as outf:
            start = timeit.default_timer()
            pickle.dump(self, outf)
            end = timeit.default_timer()
            logger.info("Saved wrapper to %s in %s" % (filename, f(abs(end-start))))
    def init_after_load(self, filenameprefix):
        self.dataset = Dataset(self.metafile)
        self.init_from_dataset()
        self.module = torch.load(filenameprefix+".module.pytorch")
        self.is_data_prepared = False
        self.valset = None
    def __getstate__(self):
        """Currently we do not pickle the dataset instance but rather re-create it when loading,
        and we do not pickle the actual pytorch module but rather use the pytorch-specific saving
        and loading mechanism."""
        # print("DEBUG: self keys=", self.__dict__.keys(), file=sys.stderr)
        assert hasattr(self, 'metafile')
        state = self.__dict__.copy()  # this creates a shallow copy
        # print("DEBUG: copy keys=", state.keys(), file=sys.stderr)
        assert 'metafile' in state
        # do not save these transient variables:
        del state['dataset']
        del state['module']
        del state['valset']
        del state['is_data_prepared']
        return state

    def __setstate__(self, state):
        """We simply restore everything that was pickled earlier, the missing fields then need
        to get restored using the _init_after_load method (called from load)"""
        assert 'metafile' in state
        self.__dict__.update(state)
        assert hasattr(self, 'metafile')

    def __repr__(self):
        repr = "ModelWrapperDefault(config=%r, cuda=%s):\nmodule=%s\noptimizer=%s\nlossfun=%s" % \
            (self.config, self._enable_cuda, self.module, self.optimizer, self.lossfunction)
        return repr