IndexConfig.java

  1. /*
  2.  *  IndexConfig.java
  3.  *
  4.  *  Copyright (c) 2007-2011, The University of Sheffield.
  5.  *
  6.  *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
  7.  *  and is free software, licenced under the GNU Lesser General Public License,
  8.  *  Version 3, June 2007 (also included with this distribution as file
  9.  *  LICENCE-LGPL3.html).
  10.  *
  11.  * Valentin Tablan, 18 Feb 2009
  12.  *
  13.  *  $Id: IndexConfig.java 17471 2014-02-27 14:48:17Z valyt $
  14.  */
  15. package gate.mimir;

import gate.Gate;
import gate.mimir.index.IndexException;
import it.unimi.di.big.mg4j.index.NullTermProcessor;
import it.unimi.di.big.mg4j.index.TermProcessor;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.io.Writer;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.HierarchicalStreamReader;
import com.thoughtworks.xstream.io.HierarchicalStreamWriter;
import com.thoughtworks.xstream.io.xml.PrettyPrintWriter;
import com.thoughtworks.xstream.io.xml.QNameMap;
import com.thoughtworks.xstream.io.xml.StaxDriver;
import com.thoughtworks.xstream.io.xml.StaxReader;

  41. /**
  42.  * Stores the configuration of a Mímir index.
  43.  */
  44. public class IndexConfig implements Serializable {
  45.  
  46.   /**
  47.    * Base class for indexer configs
  48.    */
  49.   public static class IndexerConfig implements Serializable {
  50.    
  51.     /**
  52.      * Serialisation ID.
  53.      */
  54.     private static final long serialVersionUID = -3980825689154182192L;

  55.     public IndexerConfig(boolean directIndexEnabled) {
  56.       super();
  57.       this.directIndexEnabled = directIndexEnabled;
  58.     }

  59.     /**
  60.      * Should a direct index be also built?
  61.      */
  62.     private boolean directIndexEnabled = false;
  63.    
  64.     /**
  65.      * Should a direct index be also built?
  66.      * @return <code>true</code> if a direct index was requested.
  67.      */
  68.     public boolean isDirectIndexEnabled() {
  69.       return directIndexEnabled;
  70.     }

  71.   }
  72.  
  73.   /**
  74.    * Object storing the configuration for a Token indexer.
  75.    */
  76.   public static class TokenIndexerConfig extends IndexerConfig {
  77.     /**
  78.      * Serialisation ID.
  79.      */
  80.     private static final long serialVersionUID = 1868954146230945676L;

  81.     /**
  82.      * The name of the feature on Token annotations that need to be indexed.
  83.      */
  84.     private String featureName;

  85.     /**
  86.      * The term processor to be used for this indexer.
  87.      */
  88.     private TermProcessor termProcessor;

  89.     /**
  90.      * Creates a new TokenIndexerConfig.
  91.      *
  92.      * @param featureName
  93.      *          the name of the feature (on Token annotations) that needs to be
  94.      *          indexed.
  95.      * @param termProcessor
  96.      *          The {@link TermProcessor} to be used by this indexer. If
  97.      *          <code>null</code> is given, then a {@link NullTermProcessor} is
  98.      *          used.
  99.      * @param directIndexEnabled should a direct index also be built?        
  100.      */
  101.     public TokenIndexerConfig(String featureName, TermProcessor termProcessor,
  102.                               boolean directIndexEnabled) {
  103.       super(directIndexEnabled);
  104.       this.featureName = featureName;
  105.       this.termProcessor =
  106.               termProcessor == null
  107.                       ? NullTermProcessor.getInstance()
  108.                       : termProcessor;
  109.     }

  110.     /**
  111.      * Obtains the name of the feature (on Token annotations) that needs to be
  112.      * indexed by this token indexer.
  113.      *
  114.      * @return the featureName
  115.      */
  116.     public String getFeatureName() {
  117.       return featureName;
  118.     }

  119.     /**
  120.      * Obtains the instance of {@link TermProcessor} that needs to be used by
  121.      * this token indexer.
  122.      *
  123.      * @return the termProcessor
  124.      */
  125.     public TermProcessor getTermProcessor() {
  126.       return termProcessor;
  127.     }
  128.   }

  129.   /**
  130.    * Object storing the configuration for a semantic annotation indexer.
  131.    */
  132.   public static class SemanticIndexerConfig extends IndexerConfig {
  133.     /**
  134.      * Serialisation ID.
  135.      */
  136.     private static final long serialVersionUID = -8714423642897958538L;

  137.     /**
  138.      * The types of the annotation that need to be indexed by this indexer.
  139.      */
  140.     private String[] annotationTypes;

  141.     /**
  142.      * The {@link SemanticAnnotationHelper}s used by this indexer.
  143.      */
  144.     private SemanticAnnotationHelper[] helpers;

  145.     /**
  146.      * Creates a SemanticIndexerConfig. The two arrays given as parameters must
  147.      * have the same length, the helper at a given position in the helpers array
  148.      * is used to index the annotations with the type at the same position in
  149.      * the annotationTypes array.
  150.      *
  151.      * @param annotationTypes
  152.      *          the types of the annotations that need to be indexed by this
  153.      *          indexer.
  154.      * @param helper
  155.      *          the {@link SemanticAnnotationHelper}s used by this indexer.
  156.      * @param directIndexEnabled should a direct index also be built?        
  157.      */
  158.     public SemanticIndexerConfig(String[] annotationTypes,
  159.             SemanticAnnotationHelper[] helpers, boolean directIndexEnabled) {
  160.       super(directIndexEnabled);
  161.       this.annotationTypes = annotationTypes;
  162.       this.helpers = helpers;
  163.     }

  164.     /**
  165.      * Gets the types of annotations indexed by this indexer.
  166.      *
  167.      * @return the annotationTypes
  168.      */
  169.     public String[] getAnnotationTypes() {
  170.       return annotationTypes;
  171.     }

  172.     /**
  173.      * Gets the {@link SemanticAnnotationHelper}s used to index annotations.
  174.      *
  175.      * @return the helpers
  176.      */
  177.     public SemanticAnnotationHelper[] getHelpers() {
  178.       return helpers;
  179.     }
  180.   }

  181.   /**
  182.    *
  183.    */
  184.   private static final long serialVersionUID = -8127630936829037489L;
  185.  
  186.   /**
  187.    * The current format version for the XML files containing serialisations of
  188.    * IndexConfig instances.
  189.    * Version numbers:
  190.    * <dl>
  191.    * <dt>4</dt><dd>First version number used. Indexes previous to this did not
  192.    * save their version.</dd>
  193.    * <dt>5</dt><dd>Mimir indexes are now built with MG4J-big (64 bits).</dd>
  194.    * <dt>6</dt><dd>Added support for direct indexes.</dd>
  195.    * <dt>7</dt><dd>Mímir 5.0 live index.</dt>
  196.    * <dt>8</dt><dd>Mímir 5.6 with upgraded MG4J dependencies.</dt>
  197.    * </dl>
  198.    */
  199.   public static final int FORMAT_VERSION = 8;

  200.   /**
  201.    * The default feature name for obtaining document URIs (provided as features
  202.    * on documents).
  203.    */
  204.   public static final String DOCUMENT_URI_FEATURE_DEFAULT_NAME =
  205.           "gate.mimir.uri";
  206.  
  207.   /**
  208.    * The default value for {@link #timeBetweenBatches} (1 hour).
  209.    */
  210.   public static final int DEFAULT_TIME_BETWEEN_BATCHES = 3600 * 1000;
  211.  
  212.  
  213.   /**
  214.    * The default value for {@link #maximumBatches}
  215.    */
  216.   public static final int DEFAULT_MAXIMUM_BATCHES = 20;
  217.  
  218.   /**
  219.    * A Map storing values that need to be passed between the various pluggable
  220.    * components used by this index (e.g. ORDI-based annotation helpers may
  221.    * pass references to the ORDI Factory between each other).
  222.    */
  223.   private transient Map<String, Object> context;
  224.  

  225.  
  226.   /**
  227.    * Gets the map used for passing values between the various pluggable elements
  228.    * in this index (such as annotation helpers). The returned map is live,
  229.    * meaning that all changes made to it are available to all other clients
  230.    * requesting it.    
  231.    * @return a {@link Map}, with {@link String} keys and arbitrary values.
  232.    */
  233.   public Map<String, Object> getContext() {
  234.     // lazy creation
  235.     if(context == null) {
  236.       context = Collections.synchronizedMap(new HashMap<String, Object>());
  237.     }
  238.     return context;
  239.   }

  /**
   * Creates an index configuration. The format version is set to
   * {@link #FORMAT_VERSION} and the options map starts out empty.
   *
   * @param indexDirectory
   *          the top level directory used for storing the index.
   * @param tokenAnnotationSetName
   *          name of the annotation set holding the token annotations. Use
   *          <tt>null</tt> for the default annotation set.
   * @param tokenAnnotationType
   *          the annotation type used as tokens.
   * @param semanticAnnotationSetName
   *          name of the annotation set from which semantic annotations are
   *          collected.
   * @param tokenIndexers
   *          {@link TokenIndexerConfig} values, one per token feature to be
   *          indexed.
   * @param semanticIndexers
   *          {@link SemanticIndexerConfig} values describing how semantic
   *          annotations are indexed.
   * @param docMetadataHelpers
   *          the helpers used for generating document metadata.
   * @param documentRenderer
   *          the renderer used for displaying documents and hits.
   */
  public IndexConfig(File indexDirectory, String tokenAnnotationSetName,
          String tokenAnnotationType, String semanticAnnotationSetName,
          TokenIndexerConfig[] tokenIndexers,
          SemanticIndexerConfig[] semanticIndexers,
          DocumentMetadataHelper[] docMetadataHelpers,
          DocumentRenderer documentRenderer) {
    // location and format
    this.formatVersion = FORMAT_VERSION;
    this.indexDirectory = indexDirectory;
    // token-level settings
    this.tokenAnnotationType = tokenAnnotationType;
    this.tokenAnnotationSetName = tokenAnnotationSetName;
    this.tokenIndexers = tokenIndexers;
    // semantic annotation settings
    this.semanticIndexers = semanticIndexers;
    this.semanticAnnotationSetName = semanticAnnotationSetName;
    // document-level helpers
    this.documentRenderer = documentRenderer;
    this.docMetadataHelpers = docMetadataHelpers;
    this.options = new HashMap<String, String>();
  }

  279.  
  280.  
  281.   /**
  282.    * @return the formatVersion See {@link #FORMAT_VERSION}.
  283.    */
  284.   public int getFormatVersion() {
  285.     return formatVersion;
  286.   }

  287.   /**
  288.    * See {@link #FORMAT_VERSION}.
  289.    * @param formatVersion the formatVersion to set
  290.    */
  291.   public void setFormatVersion(int formatVersion) {
  292.     this.formatVersion = formatVersion;
  293.   }

  294.   /**
  295.    * Gets the top level directory of an index.
  296.    *
  297.    * @return a {@link File} object.
  298.    */
  299.   public File getIndexDirectory() {
  300.     return indexDirectory;
  301.   }

  302.   /**
  303.    * Gets the annotation type to be used for obtaining tokens.
  304.    *
  305.    * @return an {@link String} object.
  306.    */
  307.   public String getTokenAnnotationType() {
  308.     return tokenAnnotationType;
  309.   }

  310.   /**
  311.    * Gets the name for the annotation set where token annotations can be found.
  312.    *
  313.    * @return the tokenAnnotationSet
  314.    */
  315.   public String getTokenAnnotationSetName() {
  316.     return tokenAnnotationSetName;
  317.   }

  318.   /**
  319.    * Gets the configuration for all the token indexers used.
  320.    *
  321.    * @return an array of {@link TokenIndexerConfig} values.
  322.    */
  323.   public TokenIndexerConfig[] getTokenIndexers() {
  324.     return tokenIndexers;
  325.   }

  326.   /**
  327.    * Gets the name of the annotation set containing semantic annotations.
  328.    *
  329.    * @return the semanticAnnotationSetName
  330.    */
  331.   public String getSemanticAnnotationSetName() {
  332.     return semanticAnnotationSetName;
  333.   }

  334.   /**
  335.    * Gets the configuration for all the semantic annotation indexers used.
  336.    *
  337.    * @return an array of {@link SemanticIndexerConfig} values.
  338.    */
  339.   public SemanticIndexerConfig[] getSemanticIndexers() {
  340.     return semanticIndexers;
  341.   }
  342.  
  343.   /**
  344.    * Gets the current value for the time interval (in milliseconds) between the
  345.    * saving of a batch and the next. This is the maximum interval documents
  346.    * submitted for indexing are kept in RAM (and are thus not searcheable).
  347.    *
  348.    * Defaults to {@value #DEFAULT_TIME_BETWEEN_BATCHES}.
  349.    * @return
  350.    */
  351.   public int getTimeBetweenBatches() {
  352.     return timeBetweenBatches;
  353.   }

  354.   /**
  355.    * Sets the current value for the time interval (in milliseconds) between the
  356.    * saving of a batch and the next. This is the maximum interval documents
  357.    * submitted for indexing are kept in RAM (and are thus not searcheable).
  358.    *
  359.    * Defaults to {@value #DEFAULT_TIME_BETWEEN_BATCHES}.
  360.    */  
  361.   public void setTimeBetweenBatches(int timeBetweenBatches) {
  362.     this.timeBetweenBatches = timeBetweenBatches;
  363.   }

  364.   /**
  365.    * Gets the maximum number of on-disk index batches before an index compaction
  366.    * is triggered.
  367.    *
  368.    * Defaults to {@value #DEFAULT_MAXIMUM_BATCHES}.
  369.    * @return
  370.    */
  371.   public int getMaximumBatches() {
  372.     return maximumBatches;
  373.   }

  374.  
  375.   /**
  376.    * Sets the maximum number of on-disk index batches before an index compaction
  377.    * is triggered.
  378.    *
  379.    * Defaults to {@link #DEFAULT_MAXIMUM_BATCHES}.
  380.    * @param maximumBatches
  381.    */
  382.   public void setMaximumBatches(int maximumBatches) {
  383.     this.maximumBatches = maximumBatches;
  384.   }

  385.   /**
  386.    * Gets the options map - a Map with arbitrary configuration options, which
  387.    * is made available to all sub-elements of this index (e.g. the various
  388.    * annotation helpers).  
  389.    */
  390.   public Map<String, String> getOptions() {
  391.     return options;
  392.   }

  393.   /**
  394.    * Gets the renderer to be used for displaying documents and hits.
  395.    *
  396.    * @return the documentRenderer
  397.    */
  398.   public DocumentRenderer getDocumentRenderer() {
  399.     return documentRenderer;
  400.   }

  401.   /**
  402.    * Sets the renderer to be used for displaying documents and hits.
  403.    *
  404.    * @param documentRenderer
  405.    *          the documentRenderer to set
  406.    */
  407.   public void setDocumentRenderer(DocumentRenderer documentRenderer) {
  408.     this.documentRenderer = documentRenderer;
  409.   }

  410.   /**
  411.    * Gets the array of document metadata helpers.
  412.    *
  413.    * @return the docMetadataHelpers
  414.    */
  415.   public DocumentMetadataHelper[] getDocMetadataHelpers() {
  416.     return docMetadataHelpers;
  417.   }

  418.   /**
  419.    * @return the documentUriFeatureName
  420.    */
  421.   public String getDocumentUriFeatureName() {
  422.     return documentUriFeatureName;
  423.   }

  424.   /**
  425.    * @param documentUriFeatureName
  426.    *          the documentUriFeatureName to set
  427.    */
  428.   public void setDocumentUriFeatureName(String documentUriFeatureName) {
  429.     this.documentUriFeatureName = documentUriFeatureName;
  430.   }

  431.   /**
  432.    * Creates an XStream object suitable for loading and saving Mimir index
  433.    * configurations.
  434.    */
  435.   private static XStream newXStream() {
  436.     XStream xs = new XStream(new StaxDriver());
  437.     xs.setClassLoader(Gate.getClassLoader());
  438.     xs.alias("indexConfig", IndexConfig.class);
  439.     xs.alias("tokenIndexer", TokenIndexerConfig.class);
  440.     xs.alias("semanticIndexer", SemanticIndexerConfig.class);
  441.     // when loading old indexes, add the '.big.'
  442.     xs.aliasPackage("it.unimi.dsi.mg4j", "it.unimi.di.big.mg4j");
  443.     // when loading pre-5.0 indexes, replace the package name
  444.     xs.aliasPackage("it.unimi.dsi.big.mg4j", "it.unimi.di.big.mg4j");
  445.     return xs;
  446.   }

  447.   /**
  448.    * Saves an {@link IndexConfig} object to a file via XML serialisation.
  449.    *
  450.    * @param config
  451.    *          the object to be saved.
  452.    * @param file
  453.    *          the file to write to.
  454.    * @throws IOException
  455.    */
  456.   public static void writeConfigToFile(IndexConfig config, File file)
  457.           throws IOException {
  458.     XStream xstream = newXStream();
  459.     FileWriter fileWriter = new FileWriter(file);
  460.     HierarchicalStreamWriter xmlWriter = new PrettyPrintWriter(fileWriter);
  461.     xstream.marshal(config, xmlWriter);
  462.   }

  463.   /**
  464.    * Loads an index config object from a file. The file should have been created
  465.    * using the {@link #writeConfigToFile(IndexConfig, File)} method.
  466.    *
  467.    * @param file
  468.    *          the file to read.
  469.    * @return an {@link IndexConfig} object.
  470.    * @throws IOException
  471.    *           if the provided config file cannot be found.
  472.    * @throws IndexException
  473.    *           if the parsing of the config file fails.
  474.    */
  475.   public static IndexConfig readConfigFromFile(File file) throws IOException,
  476.           IndexException {
  477.     return readConfigFromUrl(file.toURI().toURL());
  478.   }

  479.   /**
  480.    * Loads an index config object from a URL. The file should have been created
  481.    * using the {@link #writeConfigToFile(IndexConfig, File)} method.
  482.    *
  483.    * @param u
  484.    *          the URL to read.
  485.    * @return an {@link IndexConfig} object.
  486.    * @throws IOException
  487.    *           if the provided config file cannot be found.
  488.    * @throws IndexException
  489.    *           if the parsing of the config file fails.
  490.    */
  491.   public static IndexConfig readConfigFromUrl(URL u) throws IOException,
  492.           IndexException {
  493.     try {
  494.       XMLInputFactory inputFactory = XMLInputFactory.newInstance();
  495.       InputStream configStream = new BufferedInputStream(u.openStream());
  496.       XMLStreamReader xsr =
  497.               inputFactory.createXMLStreamReader(configStream);
  498.       HierarchicalStreamReader xmlReader = new StaxReader(new QNameMap(), xsr);
  499.       try {
  500.         IndexConfig theConfig = (IndexConfig)newXStream().unmarshal(xmlReader);
  501.         // check the version number
  502.         if(theConfig.formatVersion > FORMAT_VERSION){
  503.           throw new UnsupportedOperationException(
  504.             "The version of the IndexConfig at \"" + u.toExternalForm() +
  505.             "\" is greater than the maximum supported version by this Mímir " +
  506.             "implementation (" + theConfig.formatVersion + " > " + FORMAT_VERSION +
  507.             ").");
  508.         }
  509.         return theConfig;
  510.       } finally {
  511.         xmlReader.close();
  512.         configStream.close();
  513.       }
  514.     } catch(XMLStreamException e) {
  515.       throw new IndexException("Exception while reading config from " + u, e);
  516.     }
  517.   }

  518.   /**
  519.    * Loads an index config object from a file, but allows the caller to override
  520.    * the index directory stored in the file. This is useful if the index was
  521.    * created on one machine but is being used on another.
  522.    *
  523.    * @param configFile
  524.    *          the file to read
  525.    * @param indexDir
  526.    *          the top-level index directory, which will be used instead of the
  527.    *          value stored in the config file.
  528.    * @throws FileNotFoundException
  529.    *           if the provided config file cannot be found.
  530.    * @throws IndexException
  531.    *           if the parsing of the config file fails.
  532.    */
  533.   public static IndexConfig readConfigFromFile(File configFile, File indexDir)
  534.           throws IOException, IndexException {
  535.     IndexConfig conf = readConfigFromFile(configFile);
  536.     // indexDirectory is private but this method is inside the IndexConfig
  537.     // class so this assignment is legal.
  538.     conf.indexDirectory = indexDir;
  539.     return conf;
  540.   }

  541.   /**
  542.    * Loads an index config object from a URL, but allows the caller to override
  543.    * the index directory stored in the file. This is useful if the index was
  544.    * created on one machine but is being used on another.
  545.    *
  546.    * @param configFile
  547.    *          the file to read
  548.    * @param indexDir
  549.    *          the top-level index directory, which will be used instead of the
  550.    *          value stored in the config file.
  551.    * @throws FileNotFoundException
  552.    *           if the provided config file cannot be found.
  553.    * @throws IndexException
  554.    *           if the parsing of the config file fails.
  555.    */
  556.   public static IndexConfig readConfigFromUrl(URL configFile, File indexDir)
  557.           throws IOException, IndexException {
  558.     IndexConfig conf = readConfigFromUrl(configFile);
  559.     // indexDirectory is private but this method is inside the IndexConfig
  560.     // class so this assignment is legal.
  561.     conf.indexDirectory = indexDir;
  562.     return conf;
  563.   }

  564.   /**
  565.    * The top level directory of the index.
  566.    */
  567.   private File indexDirectory;

  568.   /**
  569.    * The format version for this index config instance.
  570.    */
  571.   private int formatVersion;
  572.  
  573.   /**
  574.    * The annotation type used for tokens.
  575.    */
  576.   private String tokenAnnotationType;

  577.   /**
  578.    * The annotation set where token annotations can be found.
  579.    */
  580.   private String tokenAnnotationSetName;

  581.   /**
  582.    * The configuration for all the token indexers used.
  583.    */
  584.   private TokenIndexerConfig[] tokenIndexers;

  585.   /**
  586.    * The configuration for all the semantic indexers used.
  587.    */
  588.   private SemanticIndexerConfig[] semanticIndexers;

  589.   /**
  590.    * The helpers used for generating document metadata.
  591.    */
  592.   private DocumentMetadataHelper[] docMetadataHelpers;

  593.   /**
  594.    * The document renderer used to render documents and hits.
  595.    */
  596.   private DocumentRenderer documentRenderer;

  597.   /**
  598.    * The name of the annotation set containing the semantic annotations
  599.    */
  600.   private String semanticAnnotationSetName;

  601.   /**
  602.    * The name for the document feature containing the document URI. Defaults to
  603.    * {@link #DOCUMENT_URI_FEATURE_DEFAULT_NAME}.
  604.    */
  605.   private String documentUriFeatureName = DOCUMENT_URI_FEATURE_DEFAULT_NAME;
  606.  
  607.  
  608.   /**
  609.    * The maximum amount of time between dumping batches to disk, i.e. the
  610.    * maximum amount of time a document may be stored in RAM after having been
  611.    * submitted for indexing and before it becomes searchable.
  612.    */
  613.   private int timeBetweenBatches = DEFAULT_TIME_BETWEEN_BATCHES;
  614.  
  615.  
  616.   /**
  617.    * The maximum number of constituent batches in any atomic index before a
  618.    * compact operation is triggered.
  619.    */
  620.   private int maximumBatches = DEFAULT_MAXIMUM_BATCHES;
  621.  
  622.   /**
  623.    * A Map with arbitrary configuration options, which is made available to all
  624.    * sub-elements of this index (e.g. the various annotation helpers).  
  625.    */
  626.   private Map<String, String> options;
  627. }