IndexConfig.java
- /*
- * IndexConfig.java
- *
- * Copyright (c) 2007-2011, The University of Sheffield.
- *
- * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
- * and is free software, licenced under the GNU Lesser General Public License,
- * Version 3, June 2007 (also included with this distribution as file
- * LICENCE-LGPL3.html).
- *
- * Valentin Tablan, 18 Feb 2009
- *
- * $Id: IndexConfig.java 17471 2014-02-27 14:48:17Z valyt $
- */
- package gate.mimir;
- import gate.Gate;
- import gate.mimir.index.IndexException;
- import it.unimi.di.big.mg4j.index.NullTermProcessor;
- import it.unimi.di.big.mg4j.index.TermProcessor;
- import java.io.BufferedInputStream;
- import java.io.File;
- import java.io.FileNotFoundException;
- import java.io.FileWriter;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.Serializable;
- import java.net.URL;
- import java.util.Collections;
- import java.util.HashMap;
- import java.util.Map;
- import javax.xml.stream.XMLInputFactory;
- import javax.xml.stream.XMLStreamException;
- import javax.xml.stream.XMLStreamReader;
- import com.thoughtworks.xstream.XStream;
- import com.thoughtworks.xstream.io.HierarchicalStreamReader;
- import com.thoughtworks.xstream.io.HierarchicalStreamWriter;
- import com.thoughtworks.xstream.io.xml.PrettyPrintWriter;
- import com.thoughtworks.xstream.io.xml.QNameMap;
- import com.thoughtworks.xstream.io.xml.StaxDriver;
- import com.thoughtworks.xstream.io.xml.StaxReader;
- /**
- * Holds the configuration of a Mímir index. Instances can be saved to, and
- * loaded from, XML files.
- */
- public class IndexConfig implements Serializable {
-
- /**
- * Base class for indexer configs
- */
- public static class IndexerConfig implements Serializable {
-
- /**
- * Serialisation ID.
- */
- private static final long serialVersionUID = -3980825689154182192L;
- public IndexerConfig(boolean directIndexEnabled) {
- super();
- this.directIndexEnabled = directIndexEnabled;
- }
- /**
- * Should a direct index be also built?
- */
- private boolean directIndexEnabled = false;
-
- /**
- * Should a direct index be also built?
- * @return <code>true</code> if a direct index was requested.
- */
- public boolean isDirectIndexEnabled() {
- return directIndexEnabled;
- }
- }
-
- /**
- * Object storing the configuration for a Token indexer.
- */
- public static class TokenIndexerConfig extends IndexerConfig {
- /**
- * Serialisation ID.
- */
- private static final long serialVersionUID = 1868954146230945676L;
- /**
- * The name of the feature on Token annotations that need to be indexed.
- */
- private String featureName;
- /**
- * The term processor to be used for this indexer.
- */
- private TermProcessor termProcessor;
- /**
- * Creates a new TokenIndexerConfig.
- *
- * @param featureName
- * the name of the feature (on Token annotations) that needs to be
- * indexed.
- * @param termProcessor
- * The {@link TermProcessor} to be used by this indexer. If
- * <code>null</code> is given, then a {@link NullTermProcessor} is
- * used.
- * @param directIndexEnabled should a direct index also be built?
- */
- public TokenIndexerConfig(String featureName, TermProcessor termProcessor,
- boolean directIndexEnabled) {
- super(directIndexEnabled);
- this.featureName = featureName;
- this.termProcessor =
- termProcessor == null
- ? NullTermProcessor.getInstance()
- : termProcessor;
- }
- /**
- * Obtains the name of the feature (on Token annotations) that needs to be
- * indexed by this token indexer.
- *
- * @return the featureName
- */
- public String getFeatureName() {
- return featureName;
- }
- /**
- * Obtains the instance of {@link TermProcessor} that needs to be used by
- * this token indexer.
- *
- * @return the termProcessor
- */
- public TermProcessor getTermProcessor() {
- return termProcessor;
- }
- }
- /**
- * Object storing the configuration for a semantic annotation indexer.
- */
- public static class SemanticIndexerConfig extends IndexerConfig {
- /**
- * Serialisation ID.
- */
- private static final long serialVersionUID = -8714423642897958538L;
- /**
- * The types of the annotation that need to be indexed by this indexer.
- */
- private String[] annotationTypes;
- /**
- * The {@link SemanticAnnotationHelper}s used by this indexer.
- */
- private SemanticAnnotationHelper[] helpers;
- /**
- * Creates a SemanticIndexerConfig. The two arrays given as parameters must
- * have the same length, the helper at a given position in the helpers array
- * is used to index the annotations with the type at the same position in
- * the annotationTypes array.
- *
- * @param annotationTypes
- * the types of the annotations that need to be indexed by this
- * indexer.
- * @param helper
- * the {@link SemanticAnnotationHelper}s used by this indexer.
- * @param directIndexEnabled should a direct index also be built?
- */
- public SemanticIndexerConfig(String[] annotationTypes,
- SemanticAnnotationHelper[] helpers, boolean directIndexEnabled) {
- super(directIndexEnabled);
- this.annotationTypes = annotationTypes;
- this.helpers = helpers;
- }
- /**
- * Gets the types of annotations indexed by this indexer.
- *
- * @return the annotationTypes
- */
- public String[] getAnnotationTypes() {
- return annotationTypes;
- }
- /**
- * Gets the {@link SemanticAnnotationHelper}s used to index annotations.
- *
- * @return the helpers
- */
- public SemanticAnnotationHelper[] getHelpers() {
- return helpers;
- }
- }
- /**
- *
- */
- private static final long serialVersionUID = -8127630936829037489L;
-
- /**
- * The current format version for the XML files containing serialisations of
- * IndexConfig instances.
- * Version numbers:
- * <dl>
- * <dt>4</dt><dd>First version number used. Indexes previous to this did not
- * save their version.</dd>
- * <dt>5</dt><dd>Mimir indexes are now built with MG4J-big (64 bits).</dd>
- * <dt>6</dt><dd>Added support for direct indexes.</dd>
- * <dt>7</dt><dd>Mímir 5.0 live index.</dt>
- * <dt>8</dt><dd>Mímir 5.6 with upgraded MG4J dependencies.</dt>
- * </dl>
- */
- public static final int FORMAT_VERSION = 8;
- /**
- * The default feature name for obtaining document URIs (provided as features
- * on documents).
- */
- public static final String DOCUMENT_URI_FEATURE_DEFAULT_NAME =
- "gate.mimir.uri";
-
- /**
- * The default value for {@link #timeBetweenBatches} (1 hour).
- */
- public static final int DEFAULT_TIME_BETWEEN_BATCHES = 3600 * 1000;
-
-
- /**
- * The default value for {@link #maximumBatches}
- */
- public static final int DEFAULT_MAXIMUM_BATCHES = 20;
-
- /**
- * A Map storing values that need to be passed between the various pluggable
- * components used by this index (e.g. ORDI-based annotation helpers may
- * pass references to the ORDI Factory between each other).
- */
- private transient Map<String, Object> context;
-
-
- /**
- * Gets the map used for passing values between the various pluggable elements
- * in this index (such as annotation helpers). The returned map is live,
- * meaning that all changes made to it are available to all other clients
- * requesting it.
- * @return a {@link Map}, with {@link String} keys and arbitrary values.
- */
- public Map<String, Object> getContext() {
- // lazy creation
- if(context == null) {
- context = Collections.synchronizedMap(new HashMap<String, Object>());
- }
- return context;
- }
/**
 * Builds a new index configuration. The format version is set to
 * {@link #FORMAT_VERSION} and the options map starts out empty; the remaining
 * settings keep their declared defaults until changed through the setters.
 *
 * @param indexDirectory
 *          the top level directory to be used for storing the index.
 * @param tokenAnnotationSetName
 *          the name for the annotation set where token annotations can be
 *          found. Use <tt>null</tt> for the default annotation set.
 * @param tokenAnnotationType
 *          the type of annotations used as tokens.
 * @param semanticAnnotationSetName
 *          the name for the annotation set where semantic annotations should
 *          be collected from.
 * @param tokenIndexers
 *          an array of {@link TokenIndexerConfig} values, describing the
 *          configuration for the indexing of each token feature.
 * @param semanticIndexers
 *          an array of {@link SemanticIndexerConfig} values, describing the
 *          configuration for indexing semantic annotations.
 * @param docMetadataHelpers
 *          the helpers used for generating document metadata.
 * @param documentRenderer
 *          the renderer used for displaying documents and hits.
 */
public IndexConfig(File indexDirectory, String tokenAnnotationSetName,
    String tokenAnnotationType, String semanticAnnotationSetName,
    TokenIndexerConfig[] tokenIndexers,
    SemanticIndexerConfig[] semanticIndexers,
    DocumentMetadataHelper[] docMetadataHelpers,
    DocumentRenderer documentRenderer) {
  // bookkeeping
  this.formatVersion = FORMAT_VERSION;
  this.options = new HashMap<String, String>();
  // where the index lives
  this.indexDirectory = indexDirectory;
  // token indexing
  this.tokenAnnotationSetName = tokenAnnotationSetName;
  this.tokenAnnotationType = tokenAnnotationType;
  this.tokenIndexers = tokenIndexers;
  // semantic annotation indexing
  this.semanticAnnotationSetName = semanticAnnotationSetName;
  this.semanticIndexers = semanticIndexers;
  // document level helpers
  this.docMetadataHelpers = docMetadataHelpers;
  this.documentRenderer = documentRenderer;
}
-
-
- /**
- * @return the formatVersion See {@link #FORMAT_VERSION}.
- */
- public int getFormatVersion() {
- return formatVersion;
- }
- /**
- * See {@link #FORMAT_VERSION}.
- * @param formatVersion the formatVersion to set
- */
- public void setFormatVersion(int formatVersion) {
- this.formatVersion = formatVersion;
- }
- /**
- * Gets the top level directory of an index.
- *
- * @return a {@link File} object.
- */
- public File getIndexDirectory() {
- return indexDirectory;
- }
- /**
- * Gets the annotation type to be used for obtaining tokens.
- *
- * @return an {@link String} object.
- */
- public String getTokenAnnotationType() {
- return tokenAnnotationType;
- }
- /**
- * Gets the name for the annotation set where token annotations can be found.
- *
- * @return the tokenAnnotationSet
- */
- public String getTokenAnnotationSetName() {
- return tokenAnnotationSetName;
- }
- /**
- * Gets the configuration for all the token indexers used.
- *
- * @return an array of {@link TokenIndexerConfig} values.
- */
- public TokenIndexerConfig[] getTokenIndexers() {
- return tokenIndexers;
- }
- /**
- * Gets the name of the annotation set containing semantic annotations.
- *
- * @return the semanticAnnotationSetName
- */
- public String getSemanticAnnotationSetName() {
- return semanticAnnotationSetName;
- }
- /**
- * Gets the configuration for all the semantic annotation indexers used.
- *
- * @return an array of {@link SemanticIndexerConfig} values.
- */
- public SemanticIndexerConfig[] getSemanticIndexers() {
- return semanticIndexers;
- }
-
- /**
- * Gets the current value for the time interval (in milliseconds) between the
- * saving of a batch and the next. This is the maximum interval documents
- * submitted for indexing are kept in RAM (and are thus not searcheable).
- *
- * Defaults to {@value #DEFAULT_TIME_BETWEEN_BATCHES}.
- * @return
- */
- public int getTimeBetweenBatches() {
- return timeBetweenBatches;
- }
- /**
- * Sets the current value for the time interval (in milliseconds) between the
- * saving of a batch and the next. This is the maximum interval documents
- * submitted for indexing are kept in RAM (and are thus not searcheable).
- *
- * Defaults to {@value #DEFAULT_TIME_BETWEEN_BATCHES}.
- */
- public void setTimeBetweenBatches(int timeBetweenBatches) {
- this.timeBetweenBatches = timeBetweenBatches;
- }
- /**
- * Gets the maximum number of on-disk index batches before an index compaction
- * is triggered.
- *
- * Defaults to {@value #DEFAULT_MAXIMUM_BATCHES}.
- * @return
- */
- public int getMaximumBatches() {
- return maximumBatches;
- }
-
- /**
- * Sets the maximum number of on-disk index batches before an index compaction
- * is triggered.
- *
- * Defaults to {@link #DEFAULT_MAXIMUM_BATCHES}.
- * @param maximumBatches
- */
- public void setMaximumBatches(int maximumBatches) {
- this.maximumBatches = maximumBatches;
- }
- /**
- * Gets the options map - a Map with arbitrary configuration options, which
- * is made available to all sub-elements of this index (e.g. the various
- * annotation helpers).
- */
- public Map<String, String> getOptions() {
- return options;
- }
- /**
- * Gets the renderer to be used for displaying documents and hits.
- *
- * @return the documentRenderer
- */
- public DocumentRenderer getDocumentRenderer() {
- return documentRenderer;
- }
- /**
- * Sets the renderer to be used for displaying documents and hits.
- *
- * @param documentRenderer
- * the documentRenderer to set
- */
- public void setDocumentRenderer(DocumentRenderer documentRenderer) {
- this.documentRenderer = documentRenderer;
- }
- /**
- * Gets the array of document metadata helpers.
- *
- * @return the docMetadataHelpers
- */
- public DocumentMetadataHelper[] getDocMetadataHelpers() {
- return docMetadataHelpers;
- }
- /**
- * @return the documentUriFeatureName
- */
- public String getDocumentUriFeatureName() {
- return documentUriFeatureName;
- }
- /**
- * @param documentUriFeatureName
- * the documentUriFeatureName to set
- */
- public void setDocumentUriFeatureName(String documentUriFeatureName) {
- this.documentUriFeatureName = documentUriFeatureName;
- }
- /**
- * Creates an XStream object suitable for loading and saving Mimir index
- * configurations.
- */
- private static XStream newXStream() {
- XStream xs = new XStream(new StaxDriver());
- xs.setClassLoader(Gate.getClassLoader());
- xs.alias("indexConfig", IndexConfig.class);
- xs.alias("tokenIndexer", TokenIndexerConfig.class);
- xs.alias("semanticIndexer", SemanticIndexerConfig.class);
- // when loading old indexes, add the '.big.'
- xs.aliasPackage("it.unimi.dsi.mg4j", "it.unimi.di.big.mg4j");
- // when loading pre-5.0 indexes, replace the package name
- xs.aliasPackage("it.unimi.dsi.big.mg4j", "it.unimi.di.big.mg4j");
- return xs;
- }
- /**
- * Saves an {@link IndexConfig} object to a file via XML serialisation.
- *
- * @param config
- * the object to be saved.
- * @param file
- * the file to write to.
- * @throws IOException
- */
- public static void writeConfigToFile(IndexConfig config, File file)
- throws IOException {
- XStream xstream = newXStream();
- FileWriter fileWriter = new FileWriter(file);
- HierarchicalStreamWriter xmlWriter = new PrettyPrintWriter(fileWriter);
- xstream.marshal(config, xmlWriter);
- }
- /**
- * Loads an index config object from a file. The file should have been created
- * using the {@link #writeConfigToFile(IndexConfig, File)} method.
- *
- * @param file
- * the file to read.
- * @return an {@link IndexConfig} object.
- * @throws IOException
- * if the provided config file cannot be found.
- * @throws IndexException
- * if the parsing of the config file fails.
- */
- public static IndexConfig readConfigFromFile(File file) throws IOException,
- IndexException {
- return readConfigFromUrl(file.toURI().toURL());
- }
- /**
- * Loads an index config object from a URL. The file should have been created
- * using the {@link #writeConfigToFile(IndexConfig, File)} method.
- *
- * @param u
- * the URL to read.
- * @return an {@link IndexConfig} object.
- * @throws IOException
- * if the provided config file cannot be found.
- * @throws IndexException
- * if the parsing of the config file fails.
- */
- public static IndexConfig readConfigFromUrl(URL u) throws IOException,
- IndexException {
- try {
- XMLInputFactory inputFactory = XMLInputFactory.newInstance();
- InputStream configStream = new BufferedInputStream(u.openStream());
- XMLStreamReader xsr =
- inputFactory.createXMLStreamReader(configStream);
- HierarchicalStreamReader xmlReader = new StaxReader(new QNameMap(), xsr);
- try {
- IndexConfig theConfig = (IndexConfig)newXStream().unmarshal(xmlReader);
- // check the version number
- if(theConfig.formatVersion > FORMAT_VERSION){
- throw new UnsupportedOperationException(
- "The version of the IndexConfig at \"" + u.toExternalForm() +
- "\" is greater than the maximum supported version by this Mímir " +
- "implementation (" + theConfig.formatVersion + " > " + FORMAT_VERSION +
- ").");
- }
- return theConfig;
- } finally {
- xmlReader.close();
- configStream.close();
- }
- } catch(XMLStreamException e) {
- throw new IndexException("Exception while reading config from " + u, e);
- }
- }
- /**
- * Loads an index config object from a file, but allows the caller to override
- * the index directory stored in the file. This is useful if the index was
- * created on one machine but is being used on another.
- *
- * @param configFile
- * the file to read
- * @param indexDir
- * the top-level index directory, which will be used instead of the
- * value stored in the config file.
- * @throws FileNotFoundException
- * if the provided config file cannot be found.
- * @throws IndexException
- * if the parsing of the config file fails.
- */
- public static IndexConfig readConfigFromFile(File configFile, File indexDir)
- throws IOException, IndexException {
- IndexConfig conf = readConfigFromFile(configFile);
- // indexDirectory is private but this method is inside the IndexConfig
- // class so this assignment is legal.
- conf.indexDirectory = indexDir;
- return conf;
- }
- /**
- * Loads an index config object from a URL, but allows the caller to override
- * the index directory stored in the file. This is useful if the index was
- * created on one machine but is being used on another.
- *
- * @param configFile
- * the file to read
- * @param indexDir
- * the top-level index directory, which will be used instead of the
- * value stored in the config file.
- * @throws FileNotFoundException
- * if the provided config file cannot be found.
- * @throws IndexException
- * if the parsing of the config file fails.
- */
- public static IndexConfig readConfigFromUrl(URL configFile, File indexDir)
- throws IOException, IndexException {
- IndexConfig conf = readConfigFromUrl(configFile);
- // indexDirectory is private but this method is inside the IndexConfig
- // class so this assignment is legal.
- conf.indexDirectory = indexDir;
- return conf;
- }
- /**
- * The top level directory of the index.
- */
- private File indexDirectory;
- /**
- * The format version for this index config instance.
- */
- private int formatVersion;
-
- /**
- * The annotation type used for tokens.
- */
- private String tokenAnnotationType;
- /**
- * The annotation set where token annotations can be found.
- */
- private String tokenAnnotationSetName;
- /**
- * The configuration for all the token indexers used.
- */
- private TokenIndexerConfig[] tokenIndexers;
- /**
- * The configuration for all the semantic indexers used.
- */
- private SemanticIndexerConfig[] semanticIndexers;
- /**
- * The helpers used for generating document metadata.
- */
- private DocumentMetadataHelper[] docMetadataHelpers;
- /**
- * The document renderer used to render documents and hits.
- */
- private DocumentRenderer documentRenderer;
- /**
- * The name of the annotation set containing the semantic annotations
- */
- private String semanticAnnotationSetName;
- /**
- * The name for the document feature containing the document URI. Defaults to
- * {@link #DOCUMENT_URI_FEATURE_DEFAULT_NAME}.
- */
- private String documentUriFeatureName = DOCUMENT_URI_FEATURE_DEFAULT_NAME;
-
-
- /**
- * The maximum amount of time between dumping batches to disk, i.e. the
- * maximum amount of time a document may be stored in RAM after having been
- * submitted for indexing and before it becomes searchable.
- */
- private int timeBetweenBatches = DEFAULT_TIME_BETWEEN_BATCHES;
-
-
- /**
- * The maximum number of constituent batches in any atomic index before a
- * compact operation is triggered.
- */
- private int maximumBatches = DEFAULT_MAXIMUM_BATCHES;
-
- /**
- * A Map with arbitrary configuration options, which is made available to all
- * sub-elements of this index (e.g. the various annotation helpers).
- */
- private Map<String, String> options;
- }