QueryEngine.java
/*
* QueryEngine.java
*
* Copyright (c) 2007-2011, The University of Sheffield.
*
* This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
* and is free software, licenced under the GNU Lesser General Public License,
* Version 3, June 2007 (also included with this distribution as file
* LICENCE-LGPL3.html).
*
* Valentin Tablan, 04 Mar 2009
*
* $Id: QueryEngine.java 17261 2014-01-30 14:05:14Z valyt $
*/
package gate.mimir.search;
import gate.LanguageAnalyser;
import gate.mimir.DocumentMetadataHelper;
import gate.mimir.DocumentRenderer;
import gate.mimir.IndexConfig;
import gate.mimir.IndexConfig.SemanticIndexerConfig;
import gate.mimir.MimirIndex;
import gate.mimir.SemanticAnnotationHelper;
import gate.mimir.index.AtomicAnnotationIndex;
import gate.mimir.index.AtomicTokenIndex;
import gate.mimir.index.DocumentData;
import gate.mimir.index.IndexException;
import gate.mimir.search.query.AnnotationQuery;
import gate.mimir.search.query.Binding;
import gate.mimir.search.query.QueryExecutor;
import gate.mimir.search.query.QueryNode;
import gate.mimir.search.query.parser.ParseException;
import gate.mimir.search.query.parser.QueryParser;
import gate.mimir.search.score.MimirScorer;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.Executor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * This class represents the entry point to the Mimir search API.
 *
 * A query engine wraps a single {@link MimirIndex} and hands out
 * {@link QueryRunner}s for queries against it. Every runner created through
 * {@link #getQueryRunner(QueryNode)} is tracked until it is released via
 * {@link #releaseQueryRunner(QueryRunner)}, so that {@link #close()} can close
 * whatever is still active.
 *
 * The list of active runners is a synchronized list; the configuration
 * setters (scorer source, executor, tokeniser, block size, sub-bindings) are
 * plain, unsynchronized field writes.
 */
public class QueryEngine {

  /**
   * Represents the type of index that should be searched. Mimir uses two types
   * of indexes: token indexes (which index the text input) and annotation
   * indexes (which index semantic annotations).
   */
  public enum IndexType {
    /**
     * Value representing token indexes, used for the document text.
     */
    TOKENS,

    /**
     * Value representing annotation indexes, used for the document semantic
     * annotations.
     */
    ANNOTATIONS
  }

  /**
   * The maximum size of an index that can be loaded in memory (by default 64
   * MB).
   */
  public static final long MAX_IN_MEMORY_INDEX = 64 * 1024 * 1024;

  /**
   * The default value for the document block size.
   *
   * @see #setDocumentBlockSize(int)
   */
  public static final int DEFAULT_DOCUMENT_BLOCK_SIZE = 1000;

  /**
   * The index being searched.
   */
  protected final MimirIndex index;

  /**
   * The index configuration this index was built from.
   */
  protected IndexConfig indexConfig;

  /**
   * Should sub-bindings be generated when searching?
   */
  protected boolean subBindingsEnabled;

  /**
   * A callable that produces new {@link MimirScorer} instances on request.
   */
  protected Callable<MimirScorer> scorerSource;

  /**
   * Shared logger for this class (and its subclasses).
   */
  protected static final Logger logger = LoggerFactory.getLogger(QueryEngine.class);

  /**
   * The tokeniser (technically any GATE LA) used to split the text segments
   * found in queries into individual tokens. The same tokeniser used to create
   * the indexed documents should be used here. If this value is not set, then a
   * default ANNIE tokeniser will be used.
   */
  protected LanguageAnalyser queryTokeniser;

  /**
   * The executor used to run tasks for query execution. If the value is not
   * set, then new threads are created as needed.
   */
  protected Executor executor;

  /**
   * How many documents get ranked in one ranking stage.
   */
  private int documentBlockSize = DEFAULT_DOCUMENT_BLOCK_SIZE;

  /**
   * A list of currently active QueryRunners. This is used to close all active
   * runners when the query engine itself is closed (thus releasing all open
   * files). The list is a synchronized list, created once in the constructor.
   */
  private final List<QueryRunner> activeQueryRunners;

  /**
   * Constructs a new query engine for a {@link MimirIndex}.
   *
   * @param index
   *          the index to be searched; its {@link IndexConfig} is read
   *          immediately, so it must not be <code>null</code>.
   */
  public QueryEngine(MimirIndex index) {
    this.index = index;
    this.indexConfig = index.getIndexConfig();
    this.activeQueryRunners = Collections.synchronizedList(
            new ArrayList<QueryRunner>());
    this.subBindingsEnabled = false;
  }

  /**
   * Are sub-bindings used in this query engine. Sub-bindings are used to
   * associate sub-queries with segments of the returned hits. This can be
   * useful for showing high-level details about the returned hits. By default,
   * sub-bindings are not used.
   *
   * @return <code>true</code> if sub-bindings are enabled.
   */
  public boolean isSubBindingsEnabled() {
    return subBindingsEnabled;
  }

  /**
   * Enables or disables the generation of sub-bindings.
   *
   * @param subBindingsEnabled
   *          the new value for the flag.
   */
  public void setSubBindingsEnabled(boolean subBindingsEnabled) {
    this.subBindingsEnabled = subBindingsEnabled;
  }

  /**
   * Gets the configuration parameter specifying the number of documents that
   * get processed as a block. This is used to optimise the search
   * process by limiting the number of results that get calculated by default.
   *
   * @return the current document block size.
   */
  public int getDocumentBlockSize() {
    return documentBlockSize;
  }

  /**
   * Sets the configuration parameter specifying the number of documents that
   * get processed in one go (e.g. the number of documents that get ranked when
   * enumerating results). This is used to optimise the search
   * process by limiting the number of results that get calculated by default.
   * Defaults to {@link #DEFAULT_DOCUMENT_BLOCK_SIZE}.
   *
   * @param documentBlockSize
   *          the new block size. The value is stored as given; no range check
   *          is performed here.
   */
  public void setDocumentBlockSize(int documentBlockSize) {
    this.documentBlockSize = documentBlockSize;
  }

  /**
   * Gets the current source of scorers.
   *
   * @see #setScorerSource(Callable)
   * @return the callable used to create scorers, or <code>null</code> if none
   *         was set.
   */
  public Callable<MimirScorer> getScorerSource() {
    return scorerSource;
  }

  /**
   * Provides a {@link Callable} that the Query Engine can use for obtaining
   * new instances of {@link MimirScorer} to be used for ranking new queries.
   *
   * @param scorerSource
   *          the scorer factory; may be <code>null</code>, in which case query
   *          runners are created without a scorer.
   */
  public void setScorerSource(Callable<MimirScorer> scorerSource) {
    this.scorerSource = scorerSource;
  }

  /**
   * Gets the executor used by this query engine.
   *
   * @return an executor that can be used for running tasks pertinent to this
   *         QueryEngine.
   */
  public Executor getExecutor() {
    return executor;
  }

  /**
   * Sets the {@link Executor} used for executing tasks required for running
   * queries. This allows the use of some type of thread pooling, if needed. If
   * this value is not set, then new threads are created as required.
   *
   * @param executor
   *          the executor to use.
   */
  public void setExecutor(Executor executor) {
    this.executor = executor;
  }

  /**
   * Sets the tokeniser (technically any GATE analyser) used to split the text
   * segments found in queries into individual tokens. The same tokeniser used
   * to create the indexed documents should be used here. If this value is not
   * set, then a default ANNIE tokeniser will be used.
   *
   * @param queryTokeniser
   *          the new tokeniser to be used for parsing queries.
   */
  public void setQueryTokeniser(LanguageAnalyser queryTokeniser) {
    this.queryTokeniser = queryTokeniser;
  }

  /**
   * Finds the location for a given sub-index in the arrays returned by
   * {@link #getIndexes()} and {@link #getDirectIndexes()}. Token indexes come
   * first; annotation indexes are offset by the number of token indexers.
   *
   * @param indexType
   *          the IndexType of the requested sub-index (tokens or annotations).
   * @param indexName
   *          the "name" of the requested sub-index (the indexed feature name
   *          for {@link IndexType#TOKENS} indexes, or the annotation type in
   *          the case of {@link IndexType#ANNOTATIONS} indexes).
   * @return the position in the indexes array for the requested index, or -1 if
   *         the requested index does not exist.
   * @throws IllegalArgumentException
   *           if <code>indexType</code> is neither of the known types
   *           (including <code>null</code>).
   */
  public int getSubIndexPosition(IndexType indexType, String indexName) {
    if(indexType == IndexType.TOKENS) {
      int tokenIndexCount = indexConfig.getTokenIndexers().length;
      for(int i = 0; i < tokenIndexCount; i++) {
        if(indexConfig.getTokenIndexers()[i].getFeatureName().equals(indexName)) {
          return i;
        }
      }
      return -1;
    }
    if(indexType == IndexType.ANNOTATIONS) {
      SemanticIndexerConfig[] semanticIndexers = indexConfig.getSemanticIndexers();
      for(int i = 0; i < semanticIndexers.length; i++) {
        for(String aType : semanticIndexers[i].getAnnotationTypes()) {
          if(aType.equals(indexName)) {
            // annotation indexes are listed after all the token indexes
            return indexConfig.getTokenIndexers().length + i;
          }
        }
      }
      return -1;
    }
    throw new IllegalArgumentException(
            "Don't understand sub-indexes of type " + indexType);
  }

  /**
   * Returns the index that stores the data for a particular feature of token
   * annotations.
   *
   * @param featureName
   *          the name of the token feature.
   * @return whatever the underlying {@link MimirIndex} returns for that
   *         feature name.
   */
  public AtomicTokenIndex getTokenIndex(String featureName) {
    return index.getTokenIndex(featureName);
  }

  /**
   * Returns the index that stores the data for a particular semantic annotation
   * type.
   *
   * @param annotationType
   *          the annotation type.
   * @return whatever the underlying {@link MimirIndex} returns for that
   *         annotation type.
   */
  public AtomicAnnotationIndex getAnnotationIndex(String annotationType) {
    return index.getAnnotationIndex(annotationType);
  }

  /**
   * Looks up the {@link SemanticAnnotationHelper} configured for a given
   * annotation type. Unlike {@link #getAnnotationHelper(AnnotationQuery)}, an
   * unknown type is not an error here.
   *
   * @param annotationType
   *          the annotation type to look for.
   * @return the helper found at the same position as the annotation type in
   *         its semantic indexer configuration, or <code>null</code> if no
   *         semantic indexer lists that type.
   */
  public SemanticAnnotationHelper getAnnotationHelper(String annotationType) {
    SemanticIndexerConfig[] semanticIndexers = indexConfig.getSemanticIndexers();
    for(int i = 0; i < semanticIndexers.length; i++) {
      String[] annTypes = semanticIndexers[i].getAnnotationTypes();
      for(int j = 0; j < annTypes.length; j++) {
        if(annTypes[j].equals(annotationType)) {
          // helpers and annotation types are parallel arrays
          return semanticIndexers[i].getHelpers()[j];
        }
      }
    }
    return null;
  }

  /**
   * Gets the index this query engine is searching.
   *
   * @return the index supplied at construction time.
   */
  public MimirIndex getIndex() {
    return index;
  }

  /**
   * @return the index configuration for this index
   */
  public IndexConfig getIndexConfig() {
    return indexConfig;
  }

  /**
   * Get the {@link SemanticAnnotationHelper} corresponding to a query's
   * annotation type.
   *
   * @param query
   *          the annotation query whose type is looked up.
   * @return the helper configured for the query's annotation type.
   * @throws IllegalArgumentException
   *           if the annotation helper for this type cannot be found.
   */
  public SemanticAnnotationHelper getAnnotationHelper(AnnotationQuery query) {
    String annotationType = query.getAnnotationType();
    for(SemanticIndexerConfig semConfig : indexConfig.getSemanticIndexers()) {
      String[] annTypes = semConfig.getAnnotationTypes();
      for(int i = 0; i < annTypes.length; i++) {
        if(annotationType.equals(annTypes[i])) {
          return semConfig.getHelpers()[i];
        }
      }
    }
    throw new IllegalArgumentException("Semantic annotation type \""
            + annotationType + "\" not known to this query engine.");
  }

  /**
   * Obtains a query runner for a given {@link QueryNode}. The runner is
   * registered as active until {@link #releaseQueryRunner(QueryRunner)} is
   * called for it.
   *
   * @param query
   *          the query to be executed.
   * @return a {@link QueryRunner} for the provided query, running over the
   *         indexes in this query engine. If a scorer source is set and it
   *         fails, the error is logged and the runner is created without a
   *         scorer.
   * @throws IOException
   *           if the index files cannot be accessed.
   */
  public QueryRunner getQueryRunner(QueryNode query) throws IOException {
    logger.info("Executing query: {}", query);
    QueryExecutor qExecutor = query.getQueryExecutor(this);
    // read the field once, so a concurrent setScorerSource(null) cannot slip
    // in between the null check and the call
    Callable<MimirScorer> source = scorerSource;
    MimirScorer scorer = null;
    if(source != null) {
      try {
        scorer = source.call();
      } catch(Exception e) {
        logger.error("Could not obtain a scorer. Running query unranked.", e);
      }
    }
    QueryRunner qRunner = new RankingQueryRunnerImpl(qExecutor, scorer);
    activeQueryRunners.add(qRunner);
    return qRunner;
  }

  /**
   * Notifies the QueryEngine that the given QueryRunner has been closed, so
   * it no longer needs to be tracked.
   *
   * @param qRunner
   *          the runner to forget about.
   */
  public void releaseQueryRunner(QueryRunner qRunner) {
    activeQueryRunners.remove(qRunner);
  }

  /**
   * Obtains a query runner for a given query, expressed as a String. The
   * string is parsed (using the query tokeniser, if one was set) and the
   * resulting node is passed to {@link #getQueryRunner(QueryNode)}.
   *
   * @param query
   *          the query to be executed.
   * @return a {@link QueryRunner} for the provided query, running over the
   *         indexes in this query engine.
   * @throws IOException
   *           if the index files cannot be accessed.
   * @throws ParseException
   *           if the string provided for the query cannot be parsed.
   */
  public QueryRunner getQueryRunner(String query) throws IOException,
          ParseException {
    logger.info("Executing query: {}", query);
    QueryNode qNode;
    if(queryTokeniser == null) {
      qNode = QueryParser.parse(query);
    } else {
      qNode = QueryParser.parse(query, queryTokeniser);
    }
    return getQueryRunner(qNode);
  }

  /**
   * Obtains the document text for a given search hit, together with some
   * context on either side. The requested region is passed to
   * {@link #getText(long, int, int)} unclamped, so the rules described there
   * for positions outside the document apply.
   *
   * @param hit
   *          the search hit for which the text is sought.
   * @param leftContext
   *          the number of tokens to the left of the hit to be included in the
   *          result.
   * @param rightContext
   *          the number of tokens to the right of the hit to be included in the
   *          result.
   * @return an array of arrays of {@link String}s, representing the tokens and
   *         spaces at the location of the search hit. The first element of the
   *         array is an array of tokens, the second element contains the
   *         spaces. The first element of each array corresponds to the first
   *         token of the left context.
   * @throws IndexException
   *           if the document data cannot be read.
   */
  public String[][] getHitText(Binding hit, int leftContext, int rightContext)
          throws IndexException {
    int firstToken = hit.getTermPosition() - leftContext;
    int tokenCount = leftContext + hit.getLength() + rightContext;
    return getText(hit.getDocumentId(), firstToken, tokenCount);
  }

  /**
   * Gets the text covered by a given binding.
   *
   * @param hit
   *          the binding.
   * @return an array of two string arrays, the first representing the tokens
   *         covered by the binding and the second the spaces after each token.
   * @throws IndexException
   *           if the document data cannot be read.
   */
  public String[][] getHitText(Binding hit) throws IndexException {
    return getText(hit.getDocumentId(), hit.getTermPosition(), hit.getLength());
  }

  /**
   * Get the text to the left of the given binding.
   *
   * @param hit
   *          the binding.
   * @param numTokens
   *          the maximum number of tokens of context to return. The actual
   *          number of tokens returned may be smaller than this if the hit
   *          starts within <code>numTokens</code> tokens of the start of the
   *          document.
   * @return an array of two string arrays, the first representing the tokens
   *         before the binding and the second the spaces after each token.
   * @throws IndexException
   *           if the document data cannot be read.
   */
  public String[][] getLeftContext(Binding hit, int numTokens)
          throws IndexException {
    int startOffset = hit.getTermPosition() - numTokens;
    if(startOffset < 0) {
      // Fewer than numTokens tokens precede the hit: return everything from
      // the start of the document up to the token before the hit (possibly no
      // tokens at all). startOffset is negative here, so adding it shrinks
      // numTokens by the shortfall.
      numTokens += startOffset;
      startOffset = 0;
    }
    return getText(hit.getDocumentId(), startOffset, numTokens);
  }

  /**
   * Get the text to the right of the given binding.
   *
   * @param hit
   *          the binding.
   * @param numTokens
   *          the maximum number of tokens of context to return. The actual
   *          number of tokens returned may be smaller than this if the hit ends
   *          within <code>numTokens</code> tokens of the end of the document.
   * @return an array of two string arrays, the first representing the tokens
   *         after the binding and the second the spaces after each token. Both
   *         arrays are empty if the hit ends at (or beyond) the end of the
   *         document.
   * @throws IndexException
   *           if the document data cannot be read.
   */
  public String[][] getRightContext(Binding hit, int numTokens)
          throws IndexException {
    try {
      // the document data is loaded once and used both for the length check
      // and for extracting the text
      DocumentData docData = index.getDocumentData(hit.getDocumentId());
      int tokenCount = docData.getTokens().length;
      int startOffset = hit.getTermPosition() + hit.getLength();
      if(startOffset >= tokenCount) {
        // hit is at the end of the document
        return new String[][]{new String[0], new String[0]};
      }
      // Compare against the remaining token count rather than computing
      // startOffset + numTokens, which overflows for very large numTokens
      // (e.g. Integer.MAX_VALUE) and would skip the clamping.
      int available = tokenCount - startOffset;
      if(numTokens > available) {
        // fewer than numTokens tokens of right context available, adjust
        numTokens = available;
      }
      return docData.getText(startOffset, numTokens);
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Obtains the text for a specified region of a document. The return value is
   * a pair of parallel arrays, one of tokens and the other of the spaces
   * between them. If {@code length >= 0}, the two parallel arrays will
   * always be exactly <code>length</code> items long, but any token positions
   * that do not exist in the document (i.e. before the start or beyond the end
   * of the text) will be <code>null</code>. If {@code length < 0} the
   * arrays will be of sufficient length to hold all the tokens from
   * <code>termPosition</code> to the end of the document, with no trailing
   * <code>null</code>s (there may be leading <code>null</code>s if
   * {@code termPosition < 0}).
   *
   * @param documentID
   *          the document ID
   * @param termPosition
   *          the position of the first term required
   * @param length
   *          the number of terms to return. May be negative, in which case all
   *          terms from termPosition to the end of the document will be
   *          returned.
   * @return an array of two string arrays. The first represents the tokens and
   *         the second represents the spaces between them
   * @throws IndexException
   *           if the document data cannot be read.
   */
  public String[][] getText(long documentID, int termPosition, int length)
          throws IndexException {
    try {
      DocumentData docData = index.getDocumentData(documentID);
      return docData.getText(termPosition, length);
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Renders a document and a list of hits.
   *
   * @param docID
   *          the document to be rendered.
   * @param hits
   *          the list of hits to be rendered.
   * @param output
   *          the {@link Appendable} used to write the output.
   * @throws IOException
   *           if the output cannot be written to.
   * @throws IndexException
   *           if no document renderer is available.
   */
  public void renderDocument(long docID, List<Binding> hits, Appendable output)
          throws IOException, IndexException {
    DocumentRenderer docRenderer = indexConfig.getDocumentRenderer();
    if(docRenderer == null) {
      throw new IndexException(
              "No document renderer is configured for this index!");
    }
    docRenderer.render(index.getDocumentData(docID), hits, output);
  }

  /**
   * Obtains the title stored for a document.
   *
   * @param docID
   *          the ID of the document.
   * @return the document title, as recorded in the stored document data.
   * @throws IndexException
   *           if the document data cannot be read.
   */
  public String getDocumentTitle(long docID) throws IndexException {
    try {
      return index.getDocumentData(docID).getDocumentTitle();
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Obtains the URI stored for a document.
   *
   * @param docID
   *          the ID of the document.
   * @return the document URI, as recorded in the stored document data.
   * @throws IndexException
   *           if the document data cannot be read.
   */
  public String getDocumentURI(long docID) throws IndexException {
    try {
      return index.getDocumentData(docID).getDocumentURI();
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Obtains an arbitrary document metadata field from the stored document data.
   * {@link DocumentMetadataHelper}s used at indexing time can add arbitrary
   * {@link Serializable} values as metadata fields for the documents being
   * indexed. This method is used at search time to retrieve those values.
   *
   * @param docID
   *          the ID of document for which the metadata is sought.
   * @param fieldName
   *          the name of the metadata field to be obtained
   * @return the de-serialised value stored at indexing time for the given
   *         field name and document.
   * @throws IndexException
   *           if the document data cannot be read.
   */
  public Serializable getDocumentMetadataField(long docID, String fieldName)
          throws IndexException {
    try {
      return index.getDocumentData(docID).getMetadataField(fieldName);
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Closes this {@link QueryEngine} and releases all resources. Every query
   * runner that is still registered as active is closed; a failure to close
   * one runner is logged and does not prevent the others from being closed.
   */
  public void close() {
    // Work on a snapshot: closing a runner may call back into
    // releaseQueryRunner(), which modifies the live list.
    List<QueryRunner> runnersCopy =
            new ArrayList<QueryRunner>(activeQueryRunners);
    for(QueryRunner aRunner : runnersCopy) {
      try {
        logger.debug("Closing query runner: {}", aRunner);
        aRunner.close();
      } catch(IOException e) {
        // log and ignore
        logger.error("Exception while closing query runner.", e);
      } catch(RuntimeException e) {
        // an unexpected failure in one runner must not leave the remaining
        // runners (and their open files) unclosed
        logger.error("Exception while closing query runner.", e);
      }
    }
  }
}