QueryEngine.java

  1. /*
  2.  *  QueryEngine.java
  3.  *
  4.  *  Copyright (c) 2007-2011, The University of Sheffield.
  5.  *
  6.  *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
  7.  *  and is free software, licenced under the GNU Lesser General Public License,
  8.  *  Version 3, June 2007 (also included with this distribution as file
  9.  *  LICENCE-LGPL3.html).
  10.  *
  11.  *  Valentin Tablan, 04 Mar 2009
  12.  *  
  13.  *  $Id: QueryEngine.java 17261 2014-01-30 14:05:14Z valyt $
  14.  */
  15. package gate.mimir.search;

  16. import gate.LanguageAnalyser;
  17. import gate.mimir.DocumentMetadataHelper;
  18. import gate.mimir.DocumentRenderer;
  19. import gate.mimir.IndexConfig;
  20. import gate.mimir.IndexConfig.SemanticIndexerConfig;
  21. import gate.mimir.MimirIndex;
  22. import gate.mimir.SemanticAnnotationHelper;
  23. import gate.mimir.index.AtomicAnnotationIndex;
  24. import gate.mimir.index.AtomicTokenIndex;
  25. import gate.mimir.index.DocumentData;
  26. import gate.mimir.index.IndexException;
  27. import gate.mimir.search.query.AnnotationQuery;
  28. import gate.mimir.search.query.Binding;
  29. import gate.mimir.search.query.QueryExecutor;
  30. import gate.mimir.search.query.QueryNode;
  31. import gate.mimir.search.query.parser.ParseException;
  32. import gate.mimir.search.query.parser.QueryParser;
  33. import gate.mimir.search.score.MimirScorer;

  34. import java.io.IOException;
  35. import java.io.Serializable;
  36. import java.util.ArrayList;
  37. import java.util.Collections;
  38. import java.util.List;
  39. import java.util.concurrent.Callable;
  40. import java.util.concurrent.Executor;

  41. import org.slf4j.Logger;
  42. import org.slf4j.LoggerFactory;

  43. /**
  44.  * This class represents the entry point to the Mimir search API.
  45.  */
  46. public class QueryEngine {
  47.  
  48.  
  /**
   * The kinds of sub-index that can be searched. Mimir uses two kinds of
   * indexes: token indexes (which index the text input) and annotation indexes
   * (which index semantic annotations).
   */
  public enum IndexType {
    /**
     * Token indexes, used for the document text.
     */
    TOKENS,

    /**
     * Annotation indexes, used for the document semantic annotations.
     */
    ANNOTATIONS
  }
  66.  
  67.   /**
  68.    * The maximum size of an index that can be loaded in memory (by default 64
  69.    * MB).
  70.    */
  71.   public static final long MAX_IN_MEMORY_INDEX = 64 * 1024 * 1024;
  72.  
  73.   /**
  74.    * The default value for the document block size.
  75.    * @see #setDocumentBlockSize(int)
  76.    */
  77.   public static final int DEFAULT_DOCUMENT_BLOCK_SIZE = 1000;

  78.   /**
  79.    * The index being searched.
  80.    */
  81.   protected final MimirIndex index;

  82.   /**
  83.    * The index configuration this index was built from.
  84.    */
  85.   protected IndexConfig indexConfig;

  86.   /**
  87.    * Should sub-bindings be generated when searching?
  88.    */
  89.   protected boolean subBindingsEnabled;

  90.   /**
  91.    * A callable that produces new {@link MimirScorer} instances on request.
  92.    */
  93.   protected Callable<MimirScorer> scorerSource;
  94.  
  95.   protected static final Logger logger = LoggerFactory.getLogger(QueryEngine.class);

  96.   /**
  97.    * The tokeniser (technically any GATE LA) used to split the text segments
  98.    * found in queries into individual tokens. The same tokeniser used to create
  99.    * the indexed documents should be used here. If this value is not set, then a
  100.    * default ANNIE tokeniser will be used.
  101.    */
  102.   protected LanguageAnalyser queryTokeniser;

  103.   /**
  104.    * The executor used to run tasks for query execution. If the value is not
  105.    * set, then new threads are created as needed.
  106.    */
  107.   protected Executor executor;

  108.   /**
  109.    * How many documents get ranked in one ranking stage.
  110.    */
  111.   private int documentBlockSize = DEFAULT_DOCUMENT_BLOCK_SIZE;
  112.  
  113.   /**
  114.    * A list of currently active QueryRunners. This is used to close all active
  115.    * runners when the query engine itself is closed (thus releasing all open
  116.    * files).
  117.    */
  118.   private List<QueryRunner> activeQueryRunners;

  119.   /**
  120.    * Are sub-bindings used in this query engine. Sub-bindings are used to
  121.    * associate sub-queries with segments of the returned hits. This can be
  122.    * useful for showing high-level details about the returned hits. By default,
  123.    * sub-bindings are not used.
  124.    *
  125.    * @return the subBindingsEnabled
  126.    */
  127.   public boolean isSubBindingsEnabled() {
  128.     return subBindingsEnabled;
  129.   }

  130.   /**
  131.    * @param subBindingsEnabled
  132.    *          the subBindingsEnabled to set
  133.    */
  134.   public void setSubBindingsEnabled(boolean subBindingsEnabled) {
  135.     this.subBindingsEnabled = subBindingsEnabled;
  136.   }

  137.   /**
  138.    * Gets the configuration parameter specifying the number of documents that
  139.    * get processed as a block. This is used to optimise the search
  140.    * process by limiting the number of results that get calculated by default.
  141.    * @return
  142.    */
  143.   public int getDocumentBlockSize() {
  144.     return documentBlockSize;
  145.   }
  146.  
  147.   /**
  148.    * Sets the configuration parameter specifying the number of documents that
  149.    * get processed in one go (e.g. the number of documents that get ranked when
  150.    * enumerating results). This is used to optimise the search
  151.    * process by limiting the number of results that get calculated by default.
  152.    * Defaults to {@link #DEFAULT_DOCUMENT_BLOCK_SIZE}.
  153.    * @param documentBlockSize
  154.    */
  155.   public void setDocumentBlockSize(int documentBlockSize) {
  156.     this.documentBlockSize = documentBlockSize;
  157.   }

  158.   /**
  159.    * Gets the current source of scorers.
  160.    * @see #setScorerSource(Callable)
  161.    * @return
  162.    */
  163.   public Callable<MimirScorer> getScorerSource() {
  164.     return scorerSource;
  165.   }

  166.   /**
  167.    * Provides a {@link Callable} that the Query Engine can use for obtaining
  168.    * new instances of {@link MimirScorer} to be used for ranking new queries.
  169.    * @param scorerSource
  170.    */
  171.   public void setScorerSource(Callable<MimirScorer> scorerSource) {
  172.     this.scorerSource = scorerSource;
  173.   }

  174.   /**
  175.    * Gets the executor used by this query engine.
  176.    *
  177.    * @return an executor that can be used for running tasks pertinent to this
  178.    *         QueryEngine.
  179.    */
  180.   public Executor getExecutor() {
  181.     return executor;
  182.   }

  183.   /**
  184.    * Sets the {@link Executor} used for executing tasks required for running
  185.    * queries. This allows the use of some type thread pooling, is needed. If
  186.    * this value is not set, then new threads are created as required.
  187.    *
  188.    * @param executor
  189.    */
  190.   public void setExecutor(Executor executor) {
  191.     this.executor = executor;
  192.   }

  193.   /**
  194.    * Sets the tokeniser (technically any GATE analyser) used to split the text
  195.    * segments found in queries into individual tokens. The same tokeniser used
  196.    * to create the indexed documents should be used here. If this value is not
  197.    * set, then a default ANNIE tokeniser will be used.
  198.    *
  199.    * @param queryTokeniser
  200.    *          the new tokeniser to be used for parsing queries.
  201.    */
  202.   public void setQueryTokeniser(LanguageAnalyser queryTokeniser) {
  203.     this.queryTokeniser = queryTokeniser;
  204.   }

  205.   /**
  206.    * Finds the location for a given sub-index in the arrays returned by
  207.    * {@link #getIndexes()} and {@link #getDirectIndexes()}.
  208.    * @param indexType the IndexType of the requested sub-index (tokens or
  209.    * annotations).
  210.    * @param indexName the &quot;name&quot; of the requested sub-index (the
  211.    * indexed feature name for {@link IndexType#TOKENS} indexes, or the
  212.    * annotation type in the case of {@link IndexType#ANNOTATIONS} indexes).
  213.    * @return the position in the indexes array for the requested index, or -1 if
  214.    * the requested index does not exist.
  215.    */
  216.   public int getSubIndexPosition(IndexType indexType, String indexName) {
  217.     if(indexType == IndexType.TOKENS) {
  218.       for(int i = 0; i < indexConfig.getTokenIndexers().length; i++) {
  219.         if(indexConfig.getTokenIndexers()[i].getFeatureName().equals(indexName)) {
  220.           return i;
  221.         }
  222.       }
  223.       return -1;
  224.     } else if(indexType == IndexType.ANNOTATIONS) {
  225.       for(int i = 0; i < indexConfig.getSemanticIndexers().length; i++) {
  226.         for(String aType :
  227.             indexConfig.getSemanticIndexers()[i].getAnnotationTypes()) {
  228.           if(aType.equals(indexName)) {
  229.             return indexConfig.getTokenIndexers().length + i;
  230.           }
  231.         }
  232.       }      
  233.       return -1;
  234.     } else {
  235.       throw new IllegalArgumentException(
  236.         "Don't understand sub-indexes of type " + indexType);
  237.     }
  238.   }

  239.   /**
  240.    * Returns the index that stores the data for a particular feature of token
  241.    * annotations.
  242.    *
  243.    * @param featureName
  244.    * @return
  245.    */
  246.   public AtomicTokenIndex getTokenIndex(String featureName) {
  247.     return index.getTokenIndex(featureName);
  248.   }
  249.  
  250.   /**
  251.    * Returns the index that stores the data for a particular semantic annotation
  252.    * type.
  253.    *
  254.    * @param annotationType
  255.    * @return
  256.    */
  257.   public AtomicAnnotationIndex getAnnotationIndex(String annotationType) {
  258.     return index.getAnnotationIndex(annotationType);
  259.   }
  260.  
  261.   public SemanticAnnotationHelper getAnnotationHelper(String annotationType) {
  262.     for(int i = 0; i < indexConfig.getSemanticIndexers().length; i++) {
  263.       String[] annTypes = indexConfig.getSemanticIndexers()[i]
  264.           .getAnnotationTypes();
  265.       for(int j = 0; j < annTypes.length; j++) {
  266.         if(annTypes[j].equals(annotationType)) {
  267.           return indexConfig.getSemanticIndexers()[i].getHelpers()[j];
  268.         }
  269.       }
  270.     }
  271.     return null;
  272.   }
  273.  
  274.  
  275.   /**
  276.    * Gets the index this query engine is searching.
  277.    * @return
  278.    */
  279.   public MimirIndex getIndex() {
  280.     return index;
  281.   }

  282.   /**
  283.    * @return the index configuration for this index
  284.    */
  285.   public IndexConfig getIndexConfig() {
  286.     return indexConfig;
  287.   }

  288.  
  289.  
  /**
   * Constructs a new query engine for a {@link MimirIndex}. The index
   * configuration is obtained from the index itself, sub-bindings start off
   * disabled, and the list of active query runners starts empty.
   *
   * @param index the index to be searched.
   */
  public QueryEngine(MimirIndex index) {
    this.index = index;
    this.indexConfig = index.getIndexConfig();
    this.subBindingsEnabled = false;
    // runners may be added and removed from several threads
    this.activeQueryRunners = Collections.synchronizedList(
        new ArrayList<QueryRunner>());
  }

  301. //  /**
  302. //   * Constructs a new {@link QueryEngine} for a specified Mimir index. The mimir
  303. //   * semantic repository will be initialized using the default location in the
  304. //   * filesystem, provided by the IndexConfig
  305. //   *
  306. //   * @param indexDir
  307. //   *          the directory containing an index.
  308. //   * @throws IndexException
  309. //   *           if there are problems while opening the indexes.
  310. //   */
  311. //  public QueryEngine(File indexDir) throws gate.mimir.index.IndexException {
  312. //    // read the index config
  313. //    try {
  314. //      indexConfig =
  315. //        IndexConfig.readConfigFromFile(new File(indexDir,
  316. //                Indexer.INDEX_CONFIG_FILENAME), indexDir);
  317. //      initMG4J();
  318. //      // initialise the semantic indexers
  319. //      if(indexConfig.getSemanticIndexers() != null &&
  320. //              indexConfig.getSemanticIndexers().length > 0) {
  321. //        for(SemanticIndexerConfig sic : indexConfig.getSemanticIndexers()){
  322. //          for(SemanticAnnotationHelper sah : sic.getHelpers()){
  323. //            sah.init(this);
  324. //            if(sah.getMode() == SemanticAnnotationHelper.Mode.DOCUMENT &&
  325. //                documentSizes == null) {
  326. //              // we need to load the document sizes from a token index
  327. //              documentSizes = getIndexes()[0].getIndex().sizes;
  328. //            }            
  329. //          }
  330. //        }
  331. //      }
  332. //      
  333. //      
  334. //      activeQueryRunners = Collections.synchronizedList(
  335. //              new ArrayList<QueryRunner>());
  336. //    } catch(FileNotFoundException e) {
  337. //      throw new IndexException("File not found!", e);
  338. //    } catch(IOException e) {
  339. //      throw new IndexException("Input/output exception!", e);
  340. //    }
  341. //    subBindingsEnabled = false;
  342. //
  343. //  }

  344.   /**
  345.    * Get the {@link SemanticAnnotationHelper} corresponding to a query's
  346.    * annotation type.
  347.    * @throws IllegalArgumentException if the annotation helper for this
  348.    *         type cannot be found.
  349.    */
  350.   public SemanticAnnotationHelper getAnnotationHelper(AnnotationQuery query) {
  351.     for(SemanticIndexerConfig semConfig : indexConfig.getSemanticIndexers()){
  352.       for(int i = 0; i < semConfig.getAnnotationTypes().length; i++){
  353.         if(query.getAnnotationType().equals(
  354.                 semConfig.getAnnotationTypes()[i])){
  355.           return semConfig.getHelpers()[i];
  356.         }
  357.       }
  358.     }
  359.     throw new IllegalArgumentException("Semantic annotation type \""
  360.             + query.getAnnotationType() + "\" not known to this query engine.");
  361.   }
  362.  
  363.  
  364.   /**
  365.    * Obtains a query executor for a given {@link QueryNode}.
  366.    *
  367.    * @param query
  368.    *          the query to be executed.
  369.    * @return a {@link QueryExecutor} for the provided query, running over the
  370.    *         indexes in this query engine.
  371.    * @throws IOException
  372.    *           if the index files cannot be accessed.
  373.    */
  374.   public QueryRunner getQueryRunner(QueryNode query) throws IOException {
  375.     logger.info("Executing query: " + query.toString());
  376.     QueryExecutor qExecutor = query.getQueryExecutor(this);
  377.     QueryRunner qRunner;
  378.     MimirScorer scorer = null;
  379.     try {
  380.       scorer = scorerSource == null ? null : scorerSource.call();
  381.     } catch(Exception e) {
  382.       logger.error("Could not obtain a scorer. Running query unranked.", e);
  383.     }
  384.     qRunner = new RankingQueryRunnerImpl(qExecutor, scorer);
  385.     activeQueryRunners.add(qRunner);
  386.     return qRunner;
  387.   }
  388.  
  389.   /**
  390.    * Notifies the QueryEngine that the given QueryRunner has been closed.
  391.    * @param qRunner
  392.    */
  393.   public void releaseQueryRunner(QueryRunner qRunner) {
  394.     activeQueryRunners.remove(qRunner);
  395.   }

  396.   /**
  397.    * Obtains a query executor for a given query, expressed as a String.
  398.    *
  399.    * @param query
  400.    *          the query to be executed.
  401.    * @return a {@link QueryExecutor} for the provided query, running over the
  402.    *         indexes in this query engine.
  403.    * @throws IOException
  404.    *           if the index files cannot be accessed.
  405.    * @throws ParseException
  406.    *           if the string provided for the query cannot be parsed.
  407.    */
  408.   public QueryRunner getQueryRunner(String query) throws IOException,
  409.   ParseException {
  410.     logger.info("Executing query: " + query.toString());
  411.     QueryNode qNode =
  412.       (queryTokeniser == null) ? QueryParser.parse(query) : QueryParser
  413.               .parse(query, queryTokeniser);
  414.       return getQueryRunner(qNode);
  415.   }

  416.   /**
  417.    * Obtains the document text for a given search hit.
  418.    *
  419.    * @param hit
  420.    *          the search hit for which the text is sought.
  421.    * @param leftContext
  422.    *          the number of tokens to the left of the hit to be included in the
  423.    *          result.
  424.    * @param rightContext
  425.    *          the number of tokens to the right of the hit to be included in the
  426.    *          result.
  427.    * @return an array of arrays of {@link String}s, representing the tokens and
  428.    *         spaces at the location of the search hit. The first element of the
  429.    *         array is an array of tokens, the second element contains the
  430.    *         spaces.The first element of each array corresponds to the first
  431.    *         token of the left context.
  432.    * @throws IOException
  433.    */
  434.   public String[][] getHitText(Binding hit, int leftContext, int rightContext)
  435.   throws IndexException {
  436.     return getText(hit.getDocumentId(), hit.getTermPosition() - leftContext,
  437.             leftContext + hit.getLength() + rightContext);
  438.   }

  439.   /**
  440.    * Gets the text covered by a given binding.
  441.    *
  442.    * @param hit
  443.    *          the binding.
  444.    * @return an array of two string arrays, the first representing the tokens
  445.    *         covered by the binding and the second the spaces after each token.
  446.    * @throws IOException
  447.    */
  448.   public String[][] getHitText(Binding hit) throws IndexException {
  449.     return getText(hit.getDocumentId(), hit.getTermPosition(), hit.getLength());
  450.   }

  451.   /**
  452.    * Get the text to the left of the given binding.
  453.    *
  454.    * @param hit
  455.    *          the binding.
  456.    * @param numTokens
  457.    *          the maximum number of tokens of context to return. The actual
  458.    *          number of tokens returned may be smaller than this if the hit
  459.    *          starts within <code>numTokens</code> tokens of the start of the
  460.    *          document.
  461.    * @return an array of two string arrays, the first representing the tokens
  462.    *         before the binding and the second the spaces after each token.
  463.    * @throws IOException
  464.    */
  465.   public String[][] getLeftContext(Binding hit, int numTokens)
  466.   throws IndexException {
  467.     int startOffset = hit.getTermPosition() - numTokens;
  468.     // if numTokens is greater than the start offset of the hit
  469.     // then we need to return all the document text up to the
  470.     // token before the hit position (possibly no tokens...)
  471.     if(startOffset < 0) {
  472.       numTokens += startOffset; // startOffset is negative, so this will
  473.       // subtract from numTokens
  474.       startOffset = 0;
  475.     }
  476.     return getText(hit.getDocumentId(), startOffset, numTokens);
  477.   }

  478.   /**
  479.    * Get the text to the right of the given binding.
  480.    *
  481.    * @param hit
  482.    *          the binding.
  483.    * @param numTokens
  484.    *          the maximum number of tokens of context to return. The actual
  485.    *          number of tokens returned may be smaller than this if the hit ends
  486.    *          within <code>numTokens</code> tokens of the end of the document.
  487.    * @return an array of two string arrays, the first representing the tokens
  488.    *         after the binding and the second the spaces after each token.
  489.    * @throws IOException
  490.    */
  491.   public String[][] getRightContext(Binding hit, int numTokens)
  492.   throws IndexException {
  493.     DocumentData docData;
  494.     try {
  495.       docData = index.getDocumentData(hit.getDocumentId());
  496.     } catch(IOException e) {
  497.       throw new IndexException(e);
  498.     }
  499.     int startOffset = hit.getTermPosition() + hit.getLength();
  500.     if(startOffset >= docData.getTokens().length) {
  501.       // hit is at the end of the document
  502.       return new String[][]{new String[0], new String[0]};
  503.     }
  504.     if(startOffset + numTokens > docData.getTokens().length) {
  505.       // fewer than numTokens tokens of right context available, adjust
  506.       numTokens = docData.getTokens().length - startOffset;
  507.     }
  508.     return getText(hit.getDocumentId(), startOffset, numTokens);
  509.   }

  510.   /**
  511.    * Obtains the text for a specified region of a document. The return value is
  512.    * a pair of parallel arrays, one of tokens and the other of the spaces
  513.    * between them. If <code>length >= 0</code>, the two parallel arrays will
  514.    * always be exactly <code>length</code> items long, but any token positions
  515.    * that do not exist in the document (i.e. before the start or beyond the end
  516.    * of the text) will be <code>null</code>. If <code>length &lt; 0</code> the
  517.    * arrays will be of sufficient length to hold all the tokens from
  518.    * <code>termPosition</code> to the end of the document, with no trailing
  519.    * <code>null</code>s (there may be leading <code>null</code>s if
  520.    * <code>termPosition &lt; 0</code>).
  521.    *
  522.    * @param documentID
  523.    *          the document ID
  524.    * @param termPosition
  525.    *          the position of the first term required
  526.    * @param length
  527.    *          the number of terms to return. May be negativem, in which case all
  528.    *          terms from termPosition to the end of the document will be
  529.    *          returned.
  530.    * @return an array of two string arrays. The first represents the tokens and
  531.    *         the second represents the spaces between them
  532.    * @throws IndexException
  533.    */
  534.   public String[][] getText(long documentID, int termPosition, int length)
  535.   throws IndexException {
  536.     try {
  537.       return index.getDocumentData(documentID).getText(termPosition, length);
  538.     } catch(IOException e) {
  539.       throw new IndexException(e);
  540.     }
  541.   }

  542.   /**
  543.    * Renders a document and a list of hits.
  544.    *
  545.    * @param docID
  546.    *          the document to be rendered.
  547.    * @param hits
  548.    *          the list of hits to be rendered.
  549.    * @param output
  550.    *          the {@link Appendable} used to write the output.
  551.    * @throws IOException
  552.    *           if the output cannot be written to.
  553.    * @throws IndexException
  554.    *           if no document renderer is available.
  555.    */
  556.   public void renderDocument(long docID, List<Binding> hits, Appendable output)
  557.   throws IOException, IndexException {
  558.     DocumentRenderer docRenderer = indexConfig.getDocumentRenderer();
  559.     if(docRenderer == null) { throw new IndexException(
  560.     "No document renderer is configured for this index!"); }
  561.     docRenderer.render(index.getDocumentData(docID), hits, output);
  562.   }

  563.   public String getDocumentTitle(long docID) throws IndexException {
  564.     try {
  565.       return index.getDocumentData(docID).getDocumentTitle();
  566.     } catch(IOException e) {
  567.       throw new IndexException(e);
  568.     }
  569.   }

  570.   public String getDocumentURI(long docID) throws IndexException {
  571.     try {
  572.       return index.getDocumentData(docID).getDocumentURI();
  573.     } catch(IOException e) {
  574.       throw new IndexException(e);
  575.     }
  576.   }

  577.   /**
  578.    * Obtains an arbitrary document metadata field from the stored document data.
  579.    * {@link DocumentMetadataHelper}s used at indexing time can add arbitrary
  580.    * {@link Serializable} values as metadata fields for the documents being
  581.    * indexed. This method is used at search time to retrieve those values.
  582.    *  
  583.    * @param docID the ID of document for which the metadata is sought.
  584.    * @param fieldName the name of the metadata filed to be obtained
  585.    * @return the de-serialised value stored at indexing time for the given
  586.    * field name and document.
  587.    * @throws IndexException
  588.    */
  589.   public Serializable getDocumentMetadataField(long docID, String fieldName)
  590.       throws IndexException {
  591.     try {
  592.       return index.getDocumentData(docID).getMetadataField(fieldName);
  593.     } catch(IOException e) {
  594.       throw new IndexException(e);
  595.     }
  596.   }
  597.  


  598.   /**
  599.    * Closes this {@link QueryEngine} and releases all resources.
  600.    */
  601.   public void close() {
  602.     // close all active query runners
  603.     List<QueryRunner> runnersCopy = new ArrayList<QueryRunner>(activeQueryRunners);
  604.     for(QueryRunner aRunner : runnersCopy) {
  605.       try {
  606.         logger.debug("Closing query runner: " + aRunner.toString());
  607.         aRunner.close();
  608.       } catch(IOException e) {
  609.         // log and ignore
  610.         logger.error("Exception while closing query runner.", e);
  611.       }
  612.     }
  613.   }

  614. }