IndexConfig.java

/*
 *  IndexConfig.java
 *
 *  Copyright (c) 2007-2011, The University of Sheffield.
 *
 *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
 *  and is free software, licenced under the GNU Lesser General Public License,
 *  Version 3, June 2007 (also included with this distribution as file
 *  LICENCE-LGPL3.html).
 *
 * Valentin Tablan, 18 Feb 2009
 *
 *  $Id: IndexConfig.java 17471 2014-02-27 14:48:17Z valyt $
 */
package gate.mimir;

import gate.Gate;
import gate.mimir.index.IndexException;
import it.unimi.di.big.mg4j.index.NullTermProcessor;
import it.unimi.di.big.mg4j.index.TermProcessor;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.HierarchicalStreamReader;
import com.thoughtworks.xstream.io.HierarchicalStreamWriter;
import com.thoughtworks.xstream.io.xml.PrettyPrintWriter;
import com.thoughtworks.xstream.io.xml.QNameMap;
import com.thoughtworks.xstream.io.xml.StaxDriver;
import com.thoughtworks.xstream.io.xml.StaxReader;

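/**
 * Configuration for a Mímir index: the index directory, the token and
 * semantic indexer settings, the document metadata helpers, the document
 * renderer and a map of arbitrary options. Instances can be saved to, and
 * loaded from, XML files.
 */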
public class IndexConfig implements Serializable {
  
  /**
   * Common superclass of the per-indexer configuration objects. It records
   * whether a direct index was requested for the indexer.
   */
  public static class IndexerConfig implements Serializable {

    /**
     * Serialisation ID.
     */
    private static final long serialVersionUID = -3980825689154182192L;

    /**
     * Flag recording whether a direct index should be built as well.
     */
    private boolean directIndexEnabled = false;

    /**
     * Creates a new indexer configuration.
     *
     * @param directIndexEnabled
     *          <code>true</code> if a direct index should also be built.
     */
    public IndexerConfig(boolean directIndexEnabled) {
      this.directIndexEnabled = directIndexEnabled;
    }

    /**
     * Tells whether a direct index should be built alongside the main one.
     *
     * @return <code>true</code> if a direct index was requested.
     */
    public boolean isDirectIndexEnabled() {
      return this.directIndexEnabled;
    }

  }
  
  /**
   * Configuration of a single Token indexer: which Token feature gets indexed
   * and which term processor is applied to its values.
   */
  public static class TokenIndexerConfig extends IndexerConfig {
    /**
     * Serialisation ID.
     */
    private static final long serialVersionUID = 1868954146230945676L;

    /**
     * Name of the feature (on Token annotations) whose values are indexed.
     */
    private String featureName;

    /**
     * Term processor applied by this indexer.
     */
    private TermProcessor termProcessor;

    /**
     * Builds the configuration for one token indexer.
     * 
     * @param featureName
     *          name of the Token annotation feature to be indexed.
     * @param termProcessor
     *          the {@link TermProcessor} this indexer should use; when
     *          <code>null</code>, the shared {@link NullTermProcessor}
     *          instance is used instead.
     * @param directIndexEnabled
     *          <code>true</code> if a direct index should also be built.
     */
    public TokenIndexerConfig(String featureName, TermProcessor termProcessor, 
                              boolean directIndexEnabled) {
      super(directIndexEnabled);
      this.featureName = featureName;
      if(termProcessor != null) {
        this.termProcessor = termProcessor;
      } else {
        // no processor supplied: fall back to the no-op implementation
        this.termProcessor = NullTermProcessor.getInstance();
      }
    }

    /**
     * Returns the name of the Token annotation feature indexed by this
     * token indexer.
     * 
     * @return the feature name.
     */
    public String getFeatureName() {
      return this.featureName;
    }

    /**
     * Returns the {@link TermProcessor} to be used by this token indexer.
     * 
     * @return the term processor; never <code>null</code> for instances
     *         created through the constructor.
     */
    public TermProcessor getTermProcessor() {
      return this.termProcessor;
    }
  }

  /**
   * Configuration of a single semantic annotation indexer: the annotation
   * types it covers and the helper used for each of them.
   */
  public static class SemanticIndexerConfig extends IndexerConfig {
    /**
     * Serialisation ID.
     */
    private static final long serialVersionUID = -8714423642897958538L;

    /**
     * Annotation types handled by this indexer.
     */
    private String[] annotationTypes;

    /**
     * The {@link SemanticAnnotationHelper}s used by this indexer, parallel to
     * {@link #annotationTypes}.
     */
    private SemanticAnnotationHelper[] helpers;

    /**
     * Builds the configuration for one semantic indexer. Both arrays are
     * expected to have the same length: the helper at index <code>i</code>
     * indexes the annotations whose type is at index <code>i</code> of
     * <code>annotationTypes</code>. The arrays are stored as given (no copy
     * is made and the lengths are not checked here).
     * 
     * @param annotationTypes
     *          the annotation types to be indexed by this indexer.
     * @param helpers
     *          the {@link SemanticAnnotationHelper}s used by this indexer.
     * @param directIndexEnabled
     *          <code>true</code> if a direct index should also be built.
     */
    public SemanticIndexerConfig(String[] annotationTypes,
            SemanticAnnotationHelper[] helpers, boolean directIndexEnabled) {
      super(directIndexEnabled);
      this.helpers = helpers;
      this.annotationTypes = annotationTypes;
    }

    /**
     * Returns the annotation types indexed by this indexer.
     * 
     * @return the array of annotation types (the internal array, not a copy).
     */
    public String[] getAnnotationTypes() {
      return this.annotationTypes;
    }

    /**
     * Returns the {@link SemanticAnnotationHelper}s used to index annotations.
     * 
     * @return the array of helpers (the internal array, not a copy).
     */
    public SemanticAnnotationHelper[] getHelpers() {
      return this.helpers;
    }
  }

  /**
   * Serialisation ID.
   */
  private static final long serialVersionUID = -8127630936829037489L;
  
  /**
   * The current format version for the XML files containing serialisations of 
   * IndexConfig instances.
   * Version numbers:
   * <dl>
   * <dt>4</dt><dd>First version number used. Indexes previous to this did not 
   * save their version.</dd>
   * <dt>5</dt><dd>Mimir indexes are now built with MG4J-big (64 bits).</dd>
   * <dt>6</dt><dd>Added support for direct indexes.</dd>
   * <dt>7</dt><dd>Mímir 5.0 live index.</dd>
   * <dt>8</dt><dd>Mímir 5.6 with upgraded MG4J dependencies.</dd>
   * </dl>
   */
  public static final int FORMAT_VERSION = 8;

  /**
   * The default feature name for obtaining document URIs (provided as features
   * on documents).
   */
  public static final String DOCUMENT_URI_FEATURE_DEFAULT_NAME =
          "gate.mimir.uri";
  
  /**
   * The default value for {@link #timeBetweenBatches}, in milliseconds
   * (1 hour).
   */
  public static final int DEFAULT_TIME_BETWEEN_BATCHES = 3600 * 1000;
  
  
  /**
   * The default value for {@link #maximumBatches}.
   */
  public static final int DEFAULT_MAXIMUM_BATCHES = 20;
  
  /**
   * A Map storing values that need to be passed between the various pluggable
   * components used by this index (e.g. ORDI-based annotation helpers may
   * pass references to the ORDI Factory between each other). The field is
   * transient and is created lazily by {@link #getContext()}.
   */
  private transient Map<String, Object> context;
  

  
  /**
   * Gets the map used for passing values between the various pluggable elements
   * in this index (such as annotation helpers). The returned map is live, 
   * meaning that all changes made to it are available to all other clients 
   * requesting it.
   * <p>
   * The map is created on first request. This method is synchronized so that
   * concurrent first callers cannot each create (and receive) a different
   * map, which would break the "live, shared" guarantee above.
   * 
   * @return a synchronized {@link Map}, with {@link String} keys and 
   *         arbitrary values; never <code>null</code>.
   */
  public synchronized Map<String, Object> getContext() {
    // lazy creation, guarded by this object's monitor
    if(context == null) {
      context = Collections.synchronizedMap(new HashMap<String, Object>());
    }
    return context;
  }

  /**
   * Creates an index configuration. The format version is set to the current
   * {@link #FORMAT_VERSION} and the options map starts out empty.
   * 
   * @param indexDirectory
   *          the top level directory used for storing the index.
   * @param tokenAnnotationSetName
   *          name of the annotation set holding the token annotations. Use
   *          <tt>null</tt> for the default annotation set.
   * @param tokenAnnotationType
   *          the annotation type used for tokens.
   * @param semanticAnnotationSetName
   *          name of the annotation set from which semantic annotations are
   *          collected.
   * @param tokenIndexers
   *          one {@link TokenIndexerConfig} per token feature to be indexed.
   * @param semanticIndexers
   *          the {@link SemanticIndexerConfig} values describing how semantic
   *          annotations are indexed.
   * @param docMetadataHelpers
   *          the helpers used for generating document metadata.
   * @param documentRenderer
   *          the renderer used for displaying documents and hits.
   */
  public IndexConfig(File indexDirectory, String tokenAnnotationSetName,
          String tokenAnnotationType, String semanticAnnotationSetName,
          TokenIndexerConfig[] tokenIndexers,
          SemanticIndexerConfig[] semanticIndexers,
          DocumentMetadataHelper[] docMetadataHelpers,
          DocumentRenderer documentRenderer) {

    // location and format
    this.formatVersion = FORMAT_VERSION;
    this.indexDirectory = indexDirectory;

    // token indexing
    this.tokenAnnotationType = tokenAnnotationType;
    this.tokenAnnotationSetName = tokenAnnotationSetName;
    this.tokenIndexers = tokenIndexers;

    // semantic indexing
    this.semanticIndexers = semanticIndexers;
    this.semanticAnnotationSetName = semanticAnnotationSetName;

    // document-level components
    this.documentRenderer = documentRenderer;
    this.docMetadataHelpers = docMetadataHelpers;

    // free-form options, initially empty
    this.options = new HashMap<String, String>();
  }

  
  
  /**
   * Returns the format version recorded in this configuration.
   * See {@link #FORMAT_VERSION} for the list of known versions.
   *
   * @return the format version.
   */
  public int getFormatVersion() {
    return this.formatVersion;
  }

  /**
   * Sets the format version recorded in this configuration.
   * See {@link #FORMAT_VERSION} for the list of known versions.
   *
   * @param formatVersion the format version to set.
   */
  public void setFormatVersion(int formatVersion) {
    this.formatVersion = formatVersion;
  }

  /**
   * Returns the top level directory of the index.
   * 
   * @return the index directory, as a {@link File}.
   */
  public File getIndexDirectory() {
    return this.indexDirectory;
  }

  /**
   * Returns the annotation type used for obtaining tokens.
   * 
   * @return the token annotation type.
   */
  public String getTokenAnnotationType() {
    return this.tokenAnnotationType;
  }

  /**
   * Returns the name of the annotation set in which token annotations are
   * found.
   * 
   * @return the token annotation set name.
   */
  public String getTokenAnnotationSetName() {
    return this.tokenAnnotationSetName;
  }

  /**
   * Returns the configurations of all the token indexers in use.
   * 
   * @return the array of {@link TokenIndexerConfig} values (the internal
   *         array, not a copy).
   */
  public TokenIndexerConfig[] getTokenIndexers() {
    return this.tokenIndexers;
  }

  /**
   * Returns the name of the annotation set that contains the semantic
   * annotations.
   * 
   * @return the semantic annotation set name.
   */
  public String getSemanticAnnotationSetName() {
    return this.semanticAnnotationSetName;
  }

  /**
   * Returns the configurations of all the semantic annotation indexers in use.
   * 
   * @return the array of {@link SemanticIndexerConfig} values (the internal
   *         array, not a copy).
   */
  public SemanticIndexerConfig[] getSemanticIndexers() {
    return this.semanticIndexers;
  }
  
  /**
   * Returns the time interval (in milliseconds) between the saving of one
   * batch and the next. This is the maximum time that documents submitted for
   * indexing are kept in RAM (and are thus not searchable).
   * 
   * Defaults to {@value #DEFAULT_TIME_BETWEEN_BATCHES}.
   *
   * @return the interval between batches, in milliseconds.
   */
  public int getTimeBetweenBatches() {
    return this.timeBetweenBatches;
  }

  /**
   * Sets the time interval (in milliseconds) between the saving of one batch
   * and the next. This is the maximum time that documents submitted for
   * indexing are kept in RAM (and are thus not searchable).
   * 
   * Defaults to {@value #DEFAULT_TIME_BETWEEN_BATCHES}.
   *
   * @param timeBetweenBatches the interval between batches, in milliseconds.
   */  
  public void setTimeBetweenBatches(int timeBetweenBatches) {
    this.timeBetweenBatches = timeBetweenBatches;
  }

  /**
   * Returns the maximum number of on-disk index batches allowed before an
   * index compaction is triggered.
   * 
   * Defaults to {@value #DEFAULT_MAXIMUM_BATCHES}.
   *
   * @return the maximum number of batches.
   */
  public int getMaximumBatches() {
    return this.maximumBatches;
  }

  
  /**
   * Sets the maximum number of on-disk index batches allowed before an index
   * compaction is triggered.
   * 
   * Defaults to {@value #DEFAULT_MAXIMUM_BATCHES}.
   *
   * @param maximumBatches the maximum number of batches.
   */
  public void setMaximumBatches(int maximumBatches) {
    this.maximumBatches = maximumBatches;
  }

  /**
   * Returns the options map: a Map with arbitrary configuration options that
   * is made available to all sub-elements of this index (e.g. the various
   * annotation helpers).
   *
   * @return the options map itself (not a copy).
   */
  public Map<String, String> getOptions() {
    return this.options;
  }

  /**
   * Returns the renderer used for displaying documents and hits.
   * 
   * @return the document renderer.
   */
  public DocumentRenderer getDocumentRenderer() {
    return this.documentRenderer;
  }

  /**
   * Replaces the renderer used for displaying documents and hits.
   * 
   * @param documentRenderer
   *          the document renderer to use from now on.
   */
  public void setDocumentRenderer(DocumentRenderer documentRenderer) {
    this.documentRenderer = documentRenderer;
  }

  /**
   * Returns the helpers used for generating document metadata.
   * 
   * @return the array of {@link DocumentMetadataHelper}s (the internal array,
   *         not a copy).
   */
  public DocumentMetadataHelper[] getDocMetadataHelpers() {
    return this.docMetadataHelpers;
  }

  /**
   * Returns the name of the document feature that holds the document URI.
   *
   * @return the document URI feature name.
   */
  public String getDocumentUriFeatureName() {
    return this.documentUriFeatureName;
  }

  /**
   * Sets the name of the document feature that holds the document URI.
   *
   * @param documentUriFeatureName
   *          the document URI feature name to use.
   */
  public void setDocumentUriFeatureName(String documentUriFeatureName) {
    this.documentUriFeatureName = documentUriFeatureName;
  }

  /**
   * Builds a fresh XStream instance set up for reading and writing Mimir
   * index configurations: it uses a StAX driver, the GATE class loader, short
   * element names for the configuration classes, and package aliases for
   * older MG4J package names.
   *
   * @return a newly configured {@link XStream}.
   */
  private static XStream newXStream() {
    XStream xstream = new XStream(new StaxDriver());
    // resolve classes through the GATE class loader
    xstream.setClassLoader(Gate.getClassLoader());
    // short element names for our own configuration classes
    xstream.alias("indexConfig", IndexConfig.class);
    xstream.alias("tokenIndexer", TokenIndexerConfig.class);
    xstream.alias("semanticIndexer", SemanticIndexerConfig.class);
    // old indexes use the MG4J package name without the '.big.' part
    xstream.aliasPackage("it.unimi.dsi.mg4j", "it.unimi.di.big.mg4j");
    // pre-5.0 indexes use the 'dsi' variant of the package name
    xstream.aliasPackage("it.unimi.dsi.big.mg4j", "it.unimi.di.big.mg4j");
    return xstream;
  }

  /**
   * Saves an {@link IndexConfig} object to a file via XML serialisation.
   * The underlying writer is always closed before this method returns, so the
   * serialised data is flushed to disk and the file handle is released even
   * if marshalling fails.
   * 
   * @param config
   *          the object to be saved.
   * @param file
   *          the file to write to.
   * @throws IOException
   *           if the file cannot be opened for writing.
   */
  public static void writeConfigToFile(IndexConfig config, File file)
          throws IOException {
    XStream xstream = newXStream();
    // NOTE(review): FileWriter encodes with the platform default charset,
    // while the XML is written without an encoding declaration - confirm
    // that this is acceptable for non-ASCII configuration values.
    FileWriter fileWriter = new FileWriter(file);
    HierarchicalStreamWriter xmlWriter = new PrettyPrintWriter(fileWriter);
    try {
      xstream.marshal(config, xmlWriter);
    } finally {
      // closing the XML writer flushes its buffer and closes the FileWriter
      xmlWriter.close();
    }
  }

  /**
   * Reads an index configuration from a file, which should have been produced
   * by {@link #writeConfigToFile(IndexConfig, File)}. The file is converted
   * to a URL and the work is delegated to {@link #readConfigFromUrl(URL)}.
   * 
   * @param file
   *          the file to read.
   * @return an {@link IndexConfig} object.
   * @throws IOException
   *           if the provided config file cannot be found.
   * @throws IndexException
   *           if the parsing of the config file fails.
   */
  public static IndexConfig readConfigFromFile(File file) throws IOException,
          IndexException {
    URL configUrl = file.toURI().toURL();
    return readConfigFromUrl(configUrl);
  }

  /**
   * Loads an index config object from a URL. The content should have been 
   * created using the {@link #writeConfigToFile(IndexConfig, File)} method.
   * The stream opened on the URL is always closed before this method returns,
   * whether reading succeeds or fails.
   * 
   * @param u
   *          the URL to read.
   * @return an {@link IndexConfig} object.
   * @throws IOException
   *           if the provided config file cannot be found or read.
   * @throws IndexException
   *           if an XML stream reader cannot be created for the content.
   * @throws UnsupportedOperationException
   *           if the format version stored in the config is greater than
   *           {@link #FORMAT_VERSION}.
   */
  public static IndexConfig readConfigFromUrl(URL u) throws IOException,
          IndexException {
    InputStream configStream = new BufferedInputStream(u.openStream());
    try {
      XMLInputFactory inputFactory = XMLInputFactory.newInstance();
      XMLStreamReader xsr =
              inputFactory.createXMLStreamReader(configStream);
      HierarchicalStreamReader xmlReader = new StaxReader(new QNameMap(), xsr);
      try {
        IndexConfig theConfig = (IndexConfig)newXStream().unmarshal(xmlReader);
        // check the version number
        if(theConfig.formatVersion > FORMAT_VERSION){
          throw new UnsupportedOperationException(
            "The version of the IndexConfig at \"" + u.toExternalForm() + 
            "\" is greater than the maximum supported version by this Mímir " +
            "implementation (" + theConfig.formatVersion + " > " + FORMAT_VERSION +
            ").");
        }
        return theConfig;
      } finally {
        xmlReader.close();
      }
    } catch(XMLStreamException e) {
      throw new IndexException("Exception while reading config from " + u, e);
    } finally {
      // always release the underlying stream, including when the XML reader
      // could not be created or failed to close
      configStream.close();
    }
  }

  /**
   * Reads an index configuration from a file and then replaces the index
   * directory stored in it with the one supplied by the caller. This is
   * useful when the index was created on one machine but is being used on
   * another.
   * 
   * @param configFile
   *          the file to read.
   * @param indexDir
   *          the top-level index directory, used instead of the value stored
   *          in the config file.
   * @return the loaded {@link IndexConfig}, with its index directory set to
   *         <code>indexDir</code>.
   * @throws IOException
   *           if the provided config file cannot be found.
   * @throws IndexException
   *           if the parsing of the config file fails.
   */
  public static IndexConfig readConfigFromFile(File configFile, File indexDir)
          throws IOException, IndexException {
    IndexConfig loadedConfig = readConfigFromFile(configFile);
    // the private field is accessible here because this static method is
    // declared inside IndexConfig itself
    loadedConfig.indexDirectory = indexDir;
    return loadedConfig;
  }

  /**
   * Reads an index configuration from a URL and then replaces the index
   * directory stored in it with the one supplied by the caller. This is
   * useful when the index was created on one machine but is being used on
   * another.
   * 
   * @param configFile
   *          the URL of the config file to read.
   * @param indexDir
   *          the top-level index directory, used instead of the value stored
   *          in the config file.
   * @return the loaded {@link IndexConfig}, with its index directory set to
   *         <code>indexDir</code>.
   * @throws IOException
   *           if the provided config file cannot be found.
   * @throws IndexException
   *           if the parsing of the config file fails.
   */
  public static IndexConfig readConfigFromUrl(URL configFile, File indexDir)
          throws IOException, IndexException {
    IndexConfig loadedConfig = readConfigFromUrl(configFile);
    // the private field is accessible here because this static method is
    // declared inside IndexConfig itself
    loadedConfig.indexDirectory = indexDir;
    return loadedConfig;
  }

  // NOTE(review): these instance fields are presumably written to and read
  // from the XML config file by field name (via XStream) - confirm before
  // renaming or reordering any of them.

  /**
   * The top level directory of the index.
   */
  private File indexDirectory;

  /**
   * The format version for this index config instance. See
   * {@link #FORMAT_VERSION} for the list of known versions.
   */
  private int formatVersion;
  
  /**
   * The annotation type used for tokens.
   */
  private String tokenAnnotationType;

  /**
   * The name of the annotation set where token annotations can be found.
   */
  private String tokenAnnotationSetName;

  /**
   * The configuration for all the token indexers used.
   */
  private TokenIndexerConfig[] tokenIndexers;

  /**
   * The configuration for all the semantic indexers used.
   */
  private SemanticIndexerConfig[] semanticIndexers;

  /**
   * The helpers used for generating document metadata.
   */
  private DocumentMetadataHelper[] docMetadataHelpers;

  /**
   * The document renderer used to render documents and hits.
   */
  private DocumentRenderer documentRenderer;

  /**
   * The name of the annotation set containing the semantic annotations.
   */
  private String semanticAnnotationSetName;

  /**
   * The name for the document feature containing the document URI. Defaults to
   * {@link #DOCUMENT_URI_FEATURE_DEFAULT_NAME}.
   */
  private String documentUriFeatureName = DOCUMENT_URI_FEATURE_DEFAULT_NAME;
  
  
  /**
   * The maximum amount of time (in milliseconds) between dumping batches to 
   * disk, i.e. the maximum amount of time a document may be stored in RAM 
   * after having been submitted for indexing and before it becomes 
   * searchable. Defaults to {@link #DEFAULT_TIME_BETWEEN_BATCHES}.
   */
  private int timeBetweenBatches = DEFAULT_TIME_BETWEEN_BATCHES;
  
  
  /**
   * The maximum number of constituent batches in any atomic index before a 
   * compact operation is triggered. Defaults to 
   * {@link #DEFAULT_MAXIMUM_BATCHES}.
   */
  private int maximumBatches = DEFAULT_MAXIMUM_BATCHES;
  
  /**
   * A Map with arbitrary configuration options, which is made available to all
   * sub-elements of this index (e.g. the various annotation helpers).  
   */
  private Map<String, String> options;
}