AbstractIndexTermsQuery.java

/*
 * AbstractIndexTermsQuery.java
 * 
 * Copyright (c) 2007-2011, The University of Sheffield.
 * 
 * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
 * and is free software, licenced under the GNU Lesser General Public License,
 * Version 3, June 2007 (also included with this distribution as file
 * LICENCE-LGPL3.html).
 * 
 * Valentin Tablan, 17 Jul 2012
 * 
 * $Id: AbstractIndexTermsQuery.java 17255 2014-01-29 15:29:10Z valyt $
 */
package gate.mimir.search.terms;

import gate.mimir.SemanticAnnotationHelper;
import gate.mimir.index.AtomicIndex;
import gate.mimir.search.IndexReaderPool;
import gate.mimir.search.QueryEngine;
import gate.mimir.search.QueryEngine.IndexType;
import it.unimi.di.big.mg4j.search.DocumentIterator;
import it.unimi.di.big.mg4j.search.visitor.CounterCollectionVisitor;
import it.unimi.di.big.mg4j.search.visitor.CounterSetupVisitor;
import it.unimi.di.big.mg4j.search.visitor.TermCollectionVisitor;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;

import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for terms queries that use an MG4J direct index for their search.
 * <p>
 * Subclasses are expected to call {@link #prepare(QueryEngine)} to resolve the
 * sub-index named by {@link #indexName} / {@link #indexType}, and then
 * {@link #buildResultSet(DocumentIterator)} to convert an iterator over term
 * IDs into a {@link TermsResultSet}.
 */
public abstract class AbstractIndexTermsQuery extends
  AbstractDocumentsBasedTermsQuery {
  /**
   * Serialization ID.
   */
  private static final long serialVersionUID = 8382919427152317859L;

  private static final Logger logger = LoggerFactory
    .getLogger(AbstractIndexTermsQuery.class);

  /**
   * The name of the subindex in which the terms are sought. Each Mímir index
   * includes multiple sub-indexes (some storing tokens, other storing
   * annotations), identified by a name. For token indexes, the index name is
   * the name of the token feature being indexed; for annotation indexes, the
   * index name is the annotation type.
   */
  protected final String indexName;

  /**
   * The type of index being searched (tokens or annotations).
   */
  protected final IndexType indexType;

  /**
   * The atomic index used for executing the query. This includes both the 
   * inverted and the direct index (if configured). Set by
   * {@link #prepare(QueryEngine)}.
   */
  protected transient AtomicIndex atomicIndex;

  /**
   * The semantic annotation helper for the correct annotation type (as given by
   * {@link #indexName}), if {@link #indexType} is {@link IndexType#ANNOTATIONS},
   * <code>null</code> otherwise. Set by {@link #prepare(QueryEngine)}.
   */
  protected transient SemanticAnnotationHelper annotationHelper;

  /**
   * Should stop words be filtered out of the results?
   */
  protected boolean stopWordsBlocked = false;

  /**
   * Stop words set used for filtering out stop words. See
   * {@link #stopWordsBlocked}. While <code>null</code>, no custom list is set
   * and {@link #DEFAULT_STOP_WORDS} get installed the first time a result set
   * is built with stop word blocking enabled.
   */
  protected Set<String> stopWords = null;

  /**
   * If set to true, term strings for annotation mentions are replaced with
   * their description (see
   * {@link SemanticAnnotationHelper#describeMention(String)}). Only ever
   * <code>true</code> when {@link #indexType} is
   * {@link IndexType#ANNOTATIONS}.
   */
  protected final boolean describeAnnotations;

  /**
   * The query engine used to execute this query.
   */
  protected transient QueryEngine engine;

  /**
   * Should term counts be collected and included in the results?
   */
  protected final boolean countsEnabled;

  /**
   * @return the countsEnabled
   */
  public boolean isCountsEnabled() {
    return countsEnabled;
  }

  /**
   * The default set of stop words.
   */
  public static final String[] DEFAULT_STOP_WORDS = new String[]{",", ".", "?",
    "!", ":", ";", "#", "~", "^", "@", "%", "&", "(", ")", "[", "]", "{", "}",
    "|", "\\", "<", ">", "-", "+", "*", "/", "=", "'", "\"", "'s", "1", "2",
    "3", "4", "5", "6", "7", "8", "9", "0", "a", "about", "above", "above",
    "across", "after", "afterwards", "again", "against", "all", "almost",
    "alone", "along", "already", "also", "although", "always", "am", "among",
    "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow",
    "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at",
    "b", "back", "be", "became", "because", "become", "becomes", "becoming",
    "been", "before", "beforehand", "behind", "being", "below", "beside",
    "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "c",
    "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry",
    "d", "de", "describe", "detail", "do", "done", "down", "due", "during",
    "e", "each", "eg", "eight", "either", "eleven", "else", "elsewhere",
    "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "f", "few", "fifteen", "fify",
    "fill", "find", "fire", "first", "five", "for", "former", "formerly",
    "forty", "found", "four", "from", "front", "full", "further", "g", "get",
    "give", "go", "h", "had", "has", "hasnt", "have", "he", "hence", "her",
    "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
    "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if",
    "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself",
    "j", "k", "keep", "l", "last", "latter", "latterly", "least", "less",
    "ltd", "m", "made", "many", "may", "me", "meanwhile", "might", "mill",
    "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my",
    "myself", "n", "name", "namely", "neither", "never", "nevertheless",
    "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing",
    "now", "nowhere", "o", "of", "off", "often", "on", "once", "one", "only",
    "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
    "out", "over", "own", "p", "part", "per", "perhaps", "please", "put", "q",
    "r", "rather", "re", "s", "same", "see", "seem", "seemed", "seeming",
    "seems", "serious", "several", "she", "should", "show", "side", "since",
    "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something",
    "sometime", "sometimes", "somewhere", "still", "such", "system", "t",
    "take", "ten", "than", "that", "the", "their", "them", "themselves",
    "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
    "thereupon", "these", "they", "thickv", "thin", "third", "this", "those",
    "though", "three", "through", "throughout", "thru", "thus", "to",
    "together", "too", "top", "toward", "towards", "twelve", "twenty", "two",
    "u", "un", "under", "until", "up", "upon", "us", "v", "very", "via", "w",
    "was", "we", "well", "were", "what", "whatever", "when", "whence",
    "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
    "whereupon", "wherever", "whether", "which", "while", "whither", "who",
    "whoever", "whole", "whom", "whose", "why", "will", "with", "within",
    "without", "would", "x", "y", "yet", "you", "your", "yours", "yourself",
    "yourselves", "z"};

  /**
   * @param indexName
   *          The name of the subindex in which the terms are sought. Each Mímir
   *          index includes multiple sub-indexes (some storing tokens, other
   *          storing annotations), identified by a name. For token indexes, the
   *          index name is the name of the token feature being indexed; for
   *          annotation indexes, the index name is the annotation type.
   * @param indexType
   *          The type of index to be searched (tokens or annotations).
   * @param countsEnabled
   *          should term counts be obtained?
   * @param describeAnnotations
   *          If the index being interrogated is of type
   *          {@link IndexType#ANNOTATIONS} then the indexed term strings are
   *          URIs whose format depends on the actual implementation of the
   *          index. These strings make little sense outside of the index. If
   *          this is set to <code>true</code>, then term descriptions are also
   *          included in the results set. See
   *          {@link TermsResultSet#termDescriptions} and
   *          {@link SemanticAnnotationHelper#describeMention(String)}. Setting
   *          this to <code>true</code> has no effect if the index being
   *          interrogated is a {@link IndexType#TOKENS} index.
   * @param documentIDs
   *          document IDs, passed unchanged to the superclass constructor
   *          (presumably the documents whose terms are sought).
   */
  public AbstractIndexTermsQuery(String indexName, IndexType indexType,
                                 boolean countsEnabled,
                                 boolean describeAnnotations,
                                 long... documentIDs) {
    super(documentIDs);
    this.indexName = indexName;
    this.indexType = indexType;
    this.countsEnabled = countsEnabled;
    // descriptions only exist for annotation mentions, so the flag is forced
    // to false for token indexes
    this.describeAnnotations =
      describeAnnotations && (indexType == IndexType.ANNOTATIONS);
  }

  /**
   * Populates the internal state by obtaining references to the direct and
   * indirect indexes from the {@link QueryEngine}.
   * 
   * @param engine
   *          the {@link QueryEngine} used to execute this query.
   * @throws IllegalArgumentException
   *           if the index represented by the provided query engine does not
   *           have the requested sub-index, or does not have a direct index
   *           for it (as specified by {@link #indexType} and
   *           {@link #indexName}).
   */
  protected void prepare(QueryEngine engine) {
    this.engine = engine;
    switch(indexType){
      case ANNOTATIONS:
        atomicIndex = engine.getAnnotationIndex(indexName);
        annotationHelper = engine.getAnnotationHelper(indexName);
        break;
      case TOKENS:
        atomicIndex = engine.getTokenIndex(indexName);
        break;
      default:
        throw new IllegalArgumentException("Invalid index type: " +
          indexType.toString());
    }
    // Locale.ROOT keeps the message stable regardless of the default locale
    // (e.g. a Turkish locale would otherwise lower-case 'I' to a dotless i).
    String subIndexLabel =
      "(" + indexType.toString().toLowerCase(Locale.ROOT) + ") sub-index \"" +
        indexName + "\"";
    // Defensive: if the engine signals an unknown sub-index by returning null
    // (not verifiable from this class), fail with the documented exception
    // type rather than a NullPointerException.
    if(atomicIndex == null) {
      throw new IllegalArgumentException("No index was found for " +
        subIndexLabel);
    }
    if(!atomicIndex.hasDirectIndex()) {
      throw new IllegalArgumentException("This type of query requires a " +
        "direct index, but one was not found for " + subIndexLabel);
    }
  }

  /**
   * Builds the terms result set from a document iterator. Each value returned
   * by {@link DocumentIterator#nextDocument()} is treated as a term ID and is
   * resolved to its term string through {@link #atomicIndex}. Terms are
   * skipped if their string cannot be read, if they are blocked stop words,
   * or (for annotation indexes) if they are not mention URIs recognised by
   * {@link #annotationHelper}.
   * 
   * @param documentIterator
   *          the iterator supplying the term IDs.
   * @return the result set; it includes counts if {@link #countsEnabled} is
   *         set, and is grouped by description if
   *         {@link #describeAnnotations} is set.
   * @throws IOException
   *           if the underlying iterator fails.
   */
  protected TermsResultSet buildResultSet(DocumentIterator documentIterator)
    throws IOException {
    // prepare local data
    ObjectArrayList<String> termStrings = new ObjectArrayList<String>();
    ObjectArrayList<String> termDescriptions =
      describeAnnotations ? new ObjectArrayList<String>() : null;
    IntArrayList termCounts = countsEnabled ? new IntArrayList() : null;
    TermCollectionVisitor termCollectionVisitor = null;
    CounterSetupVisitor counterSetupVisitor = null;
    CounterCollectionVisitor counterCollectionVisitor = null;
    if(countsEnabled) {
      termCollectionVisitor = new TermCollectionVisitor();
      counterSetupVisitor = new CounterSetupVisitor(termCollectionVisitor);
      counterCollectionVisitor =
        new CounterCollectionVisitor(counterSetupVisitor);
      termCollectionVisitor.prepare();
      documentIterator.accept(termCollectionVisitor);
      counterSetupVisitor.prepare();
      documentIterator.accept(counterSetupVisitor);
    }
    // use the default list if no custom one was set
    if(stopWordsBlocked && stopWords == null) {
      setStopWords(DEFAULT_STOP_WORDS);
    }
    // The advance step lives in the loop header, so every "skip this term"
    // path is a plain continue and cannot forget to move the iterator on.
    for(long termId = documentIterator.nextDocument();
        termId != DocumentIterator.END_OF_LIST && termId != -1;
        termId = documentIterator.nextDocument()) {
      int termCount = -1;
      if(countsEnabled) {
        // sum the per-counter values collected at the current position
        counterSetupVisitor.clear();
        documentIterator.acceptOnTruePaths(counterCollectionVisitor);
        termCount = 0;
        for(int aCount : counterSetupVisitor.count) {
          termCount += aCount;
        }
      }
      // get the term string
      String termString;
      try {
        termString = atomicIndex.getDirectTerm(termId).toString();
      } catch(Exception e) {
        // best effort: report the unreadable term and carry on with the rest
        logger.error("Error reading indirect index term with ID " + termId, e);
        continue;
      }
      if(stopWordsBlocked && stopWords.contains(termString)) {
        // skip this term
        continue;
      }
      if(indexType == IndexType.ANNOTATIONS) {
        if(!annotationHelper.isMentionUri(termString)) {
          // skip this term (not produced by our helper)
          continue;
        }
        if(describeAnnotations) {
          termDescriptions.add(annotationHelper.describeMention(termString));
        }
      }
      termStrings.add(termString);
      if(countsEnabled) {
        termCounts.add(termCount);
      }
    }
    // construct the result
    TermsResultSet res =
      new TermsResultSet(termStrings.toArray(new String[termStrings.size()]),
        null, countsEnabled ? termCounts.toIntArray() : null,
        describeAnnotations
          ? termDescriptions.toArray(new String[termDescriptions.size()])
          : null);
    if(describeAnnotations) res = TermsResultSet.groupByDescription(res);
    return res;
  }

  /**
   * Should stop words be filtered out from the results? Defaults to
   * <code>false</code>.
   * 
   * @return the stopWordsBlocked
   */
  public boolean isStopWordsBlocked() {
    return stopWordsBlocked;
  }

  /**
   * Enables or disables the filtering of stop words from the results. If a
   * custom list of stop words has been set (by calling
   * {@link #setStopWords(String[])}) then it is used, otherwise the
   * {@link #DEFAULT_STOP_WORDS} list is used.
   * 
   * @param stopWordsBlocked
   *          the stopWordsBlocked to set
   */
  public void setStopWordsBlocked(boolean stopWordsBlocked) {
    this.stopWordsBlocked = stopWordsBlocked;
  }

  /**
   * Gets the current set of stop words. This is the internal set itself (not
   * a copy) and is <code>null</code> while no list has been set.
   * 
   * @return the stopWords
   */
  public Set<String> getStopWords() {
    return stopWords;
  }

  /**
   * Sets the custom stop words from a set. The supplied set is copied, so
   * later changes to it do not affect this query. The actual blocking also
   * needs to be enabled by calling {@link #setStopWordsBlocked(boolean)}.
   * 
   * @param stopWords
   *          the stop words to set, or <code>null</code> to clear the custom
   *          list so that the {@link #DEFAULT_STOP_WORDS} are used.
   */
  public void setStopWords(Set<String> stopWords) {
    this.stopWords = stopWords == null ? null : new HashSet<String>(stopWords);
  }

  /**
   * Sets the custom list of stop words that should be blocked from query
   * results. The actual blocking also needs to be enabled by calling
   * {@link #setStopWordsBlocked(boolean)}. If this array is set to
   * <code>null</code>, then the {@link #DEFAULT_STOP_WORDS} are used.
   * 
   * @param stopWords
   *          the stopWords to set
   */
  public void setStopWords(String[] stopWords) {
    if(stopWords == null) {
      // documented contract: null clears the custom list
      this.stopWords = null;
      return;
    }
    Set<String> copy = new HashSet<String>(stopWords.length);
    for(String sw : stopWords) {
      copy.add(sw);
    }
    this.stopWords = copy;
  }
}