AbstractIndexTermsQuery.java

  1. /*
  2.  * AbstractIndexTermsQuery.java
  3.  *
  4.  * Copyright (c) 2007-2011, The University of Sheffield.
  5.  *
  6.  * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
  7.  * and is free software, licenced under the GNU Lesser General Public License,
  8.  * Version 3, June 2007 (also included with this distribution as file
  9.  * LICENCE-LGPL3.html).
  10.  *
  11.  * Valentin Tablan, 17 Jul 2012
  12.  *
  13.  * $Id: AbstractIndexTermsQuery.java 17255 2014-01-29 15:29:10Z valyt $
  14.  */
  15. package gate.mimir.search.terms;

  16. import gate.mimir.SemanticAnnotationHelper;
  17. import gate.mimir.index.AtomicIndex;
  18. import gate.mimir.search.IndexReaderPool;
  19. import gate.mimir.search.QueryEngine;
  20. import gate.mimir.search.QueryEngine.IndexType;
  21. import it.unimi.di.big.mg4j.search.DocumentIterator;
  22. import it.unimi.di.big.mg4j.search.visitor.CounterCollectionVisitor;
  23. import it.unimi.di.big.mg4j.search.visitor.CounterSetupVisitor;
  24. import it.unimi.di.big.mg4j.search.visitor.TermCollectionVisitor;
  25. import it.unimi.dsi.fastutil.ints.IntArrayList;
  26. import it.unimi.dsi.fastutil.objects.ObjectArrayList;

  27. import java.io.IOException;
  28. import java.util.HashSet;
  29. import java.util.Set;

  30. import org.slf4j.Logger;
  31. import org.slf4j.LoggerFactory;

  32. /**
  33.  * Base class for terms queries that use an MG4J direct index for their search.
  34.  */
  35. public abstract class AbstractIndexTermsQuery extends
  36.   AbstractDocumentsBasedTermsQuery {
  37.   /**
  38.    * Serialization ID.
  39.    */
  40.   private static final long serialVersionUID = 8382919427152317859L;

  41.   private static final Logger logger = LoggerFactory
  42.     .getLogger(AbstractIndexTermsQuery.class);

  43.   /**
  44.    * The name of the subindex in which the terms are sought. Each Mímir index
  45.    * includes multiple sub-indexes (some storing tokens, other storing
  46.    * annotations), identified by a name. For token indexes, the index name is
  47.    * the name of the token feature being indexed; for annotation indexes, the
  48.    * index name is the annotation type.
  49.    */
  50.   protected final String indexName;

  51.   /**
  52.    * The type of index being searched (tokens or annotations).
  53.    */
  54.   protected final IndexType indexType;

  55.   /**
  56.    * The atomic index used for executing the query. This includes both the
  57.    * inverted and the direct index (if configured).
  58.    */
  59.   protected transient AtomicIndex atomicIndex;

  60.   /**
  61.    * The semantic annotation helper for the correct annotation type (as given by
  62.    * {@link #indexName}), if {@link #indexType} is {@link IndexType#ANNOTATIONS}
  63.    * , <code>null</code> otherwise.
  64.    */
  65.   protected transient SemanticAnnotationHelper annotationHelper;

  66.   /**
  67.    * Should stop words be filtered out of the results?
  68.    */
  69.   protected boolean stopWordsBlocked = false;

  70.   /**
  71.    * Stop words set used for filtering out stop words. See
  72.    * {@link #stopWordsBlocked}.
  73.    */
  74.   protected Set<String> stopWords = null;

  75.   /**
  76.    * If set to true, term strings for annotation mentions are replaced with
  77.    * their description (see
  78.    * {@link SemanticAnnotationHelper#describeMention(String)}.
  79.    */
  80.   protected final boolean describeAnnotations;

  81.   /**
  82.    * The query engine used to execute this query.
  83.    */
  84.   protected transient QueryEngine engine;

  85.   protected final boolean countsEnabled;

  86.   /**
  87.    * @return the countsEnabled
  88.    */
  89.   public boolean isCountsEnabled() {
  90.     return countsEnabled;
  91.   }

  92.   /**
  93.    * The default set of stop words.
  94.    */
  95.   public static final String[] DEFAULT_STOP_WORDS = new String[]{",", ".", "?",
  96.     "!", ":", ";", "#", "~", "^", "@", "%", "&", "(", ")", "[", "]", "{", "}",
  97.     "|", "\\", "<", ">", "-", "+", "*", "/", "=", "'", "\"", "'s", "1", "2",
  98.     "3", "4", "5", "6", "7", "8", "9", "0", "a", "about", "above", "above",
  99.     "across", "after", "afterwards", "again", "against", "all", "almost",
  100.     "alone", "along", "already", "also", "although", "always", "am", "among",
  101.     "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow",
  102.     "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at",
  103.     "b", "back", "be", "became", "because", "become", "becomes", "becoming",
  104.     "been", "before", "beforehand", "behind", "being", "below", "beside",
  105.     "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "c",
  106.     "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry",
  107.     "d", "de", "describe", "detail", "do", "done", "down", "due", "during",
  108.     "e", "each", "eg", "eight", "either", "eleven", "else", "elsewhere",
  109.     "empty", "enough", "etc", "even", "ever", "every", "everyone",
  110.     "everything", "everywhere", "except", "f", "few", "fifteen", "fify",
  111.     "fill", "find", "fire", "first", "five", "for", "former", "formerly",
  112.     "forty", "found", "four", "from", "front", "full", "further", "g", "get",
  113.     "give", "go", "h", "had", "has", "hasnt", "have", "he", "hence", "her",
  114.     "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
  115.     "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if",
  116.     "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself",
  117.     "j", "k", "keep", "l", "last", "latter", "latterly", "least", "less",
  118.     "ltd", "m", "made", "many", "may", "me", "meanwhile", "might", "mill",
  119.     "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my",
  120.     "myself", "n", "name", "namely", "neither", "never", "nevertheless",
  121.     "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing",
  122.     "now", "nowhere", "o", "of", "off", "often", "on", "once", "one", "only",
  123.     "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
  124.     "out", "over", "own", "p", "part", "per", "perhaps", "please", "put", "q",
  125.     "r", "rather", "re", "s", "same", "see", "seem", "seemed", "seeming",
  126.     "seems", "serious", "several", "she", "should", "show", "side", "since",
  127.     "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something",
  128.     "sometime", "sometimes", "somewhere", "still", "such", "system", "t",
  129.     "take", "ten", "than", "that", "the", "their", "them", "themselves",
  130.     "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
  131.     "thereupon", "these", "they", "thickv", "thin", "third", "this", "those",
  132.     "though", "three", "through", "throughout", "thru", "thus", "to",
  133.     "together", "too", "top", "toward", "towards", "twelve", "twenty", "two",
  134.     "u", "un", "under", "until", "up", "upon", "us", "v", "very", "via", "w",
  135.     "was", "we", "well", "were", "what", "whatever", "when", "whence",
  136.     "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
  137.     "whereupon", "wherever", "whether", "which", "while", "whither", "who",
  138.     "whoever", "whole", "whom", "whose", "why", "will", "with", "within",
  139.     "without", "would", "x", "y", "yet", "you", "your", "yours", "yourself",
  140.     "yourselves", "z"};

  141.   /**
  142.    * @param indexName
  143.    *          The name of the subindex in which the terms are sought. Each Mímir
  144.    *          index includes multiple sub-indexes (some storing tokens, other
  145.    *          storing annotations), identified by a name. For token indexes, the
  146.    *          index name is the name of the token feature being indexed; for
  147.    *          annotation indexes, the index name is the annotation type.
  148.    * @param indexType
  149.    *          The type of index to be searched (tokens or annotations).
  150.    * @param countsEnabled
  151.    *          should term counts be obtained?
  152.    * @param describeAnnotations
  153.    *          If the index being interrogated is of type
  154.    *          {@link IndexType#ANNOTATIONS} then the indexed term strings are
  155.    *          URIs whose format depends on the actual implementation of the
  156.    *          index. These strings make little sense outside of the index. If
  157.    *          this is set to <code>true</code>, then term descriptions are also
  158.    *          included in the results set. See
  159.    *          {@link TermsResultSet#termDescriptions} and
  160.    *          {@link SemanticAnnotationHelper#describeMention(String)}. Setting
  161.    *          this to <code>true</code> has no effect if the index being
  162.    *          interrogated is a {@link IndexType#TOKENS} index.
  163.    */
  164.   public AbstractIndexTermsQuery(String indexName, IndexType indexType,
  165.                                  boolean countsEnabled,
  166.                                  boolean describeAnnotations,
  167.                                  long... documentIDs) {
  168.     super(documentIDs);
  169.     this.indexName = indexName;
  170.     this.indexType = indexType;
  171.     this.countsEnabled = countsEnabled;
  172.     this.describeAnnotations =
  173.       describeAnnotations && (indexType == IndexType.ANNOTATIONS);
  174.   }

  175.   /**
  176.    * Populates the internal state by obtaining references to the direct and
  177.    * indirect indexes from the {@link QueryEngine}.
  178.    *
  179.    * @param engine
  180.    *          the {@link QueryEngine} used to execute this query.
  181.    * @throws IllegalArgumentException
  182.    *           if the index represented by the provided query engine does not
  183.    *           have a direct index for the given sub-index (as specified by
  184.    *           {@link #indexType} and {@link #indexName}).
  185.    */
  186.   protected void prepare(QueryEngine engine) {
  187.     this.engine = engine;
  188.     switch(indexType){
  189.       case ANNOTATIONS:
  190.         atomicIndex = engine.getAnnotationIndex(indexName);
  191.         annotationHelper = engine.getAnnotationHelper(indexName);
  192.         break;
  193.       case TOKENS:
  194.         atomicIndex = engine.getTokenIndex(indexName);
  195.         break;
  196.       default:
  197.         throw new IllegalArgumentException("Invalid index type: " +
  198.           indexType.toString());
  199.     }
  200.     if(!atomicIndex.hasDirectIndex()) { throw new IllegalArgumentException(
  201.       "This type of query requires a " +
  202.         "direct index, but one was not found for (" +
  203.         indexType.toString().toLowerCase() + ") sub-index \"" + indexName +
  204.         "\""); }
  205.   }

  206.   protected TermsResultSet buildResultSet(DocumentIterator documentIterator)
  207.     throws IOException {
  208.     // prepare local data
  209.     ObjectArrayList<String> termStrings = new ObjectArrayList<String>();
  210.     ObjectArrayList<String> termDescriptions =
  211.       describeAnnotations ? new ObjectArrayList<String>() : null;
  212.     IntArrayList termCounts = countsEnabled ? new IntArrayList() : null;
  213.     TermCollectionVisitor termCollectionVisitor = null;
  214.     CounterSetupVisitor counterSetupVisitor = null;
  215.     CounterCollectionVisitor counterCollectionVisitor = null;
  216.     if(countsEnabled) {
  217.       termCollectionVisitor = new TermCollectionVisitor();
  218.       counterSetupVisitor = new CounterSetupVisitor(termCollectionVisitor);
  219.       counterCollectionVisitor =
  220.         new CounterCollectionVisitor(counterSetupVisitor);
  221.       termCollectionVisitor.prepare();
  222.       documentIterator.accept(termCollectionVisitor);
  223.       counterSetupVisitor.prepare();
  224.       documentIterator.accept(counterSetupVisitor);
  225.     }
  226.     if(stopWordsBlocked) {
  227.       // use the default list if no custom one was set
  228.       if(stopWords == null) setStopWords(DEFAULT_STOP_WORDS);
  229.     }
  230.     long termId = documentIterator.nextDocument();
  231.     terms: while(termId != DocumentIterator.END_OF_LIST && termId != -1) {
  232.       int termCount = -1;
  233.       if(countsEnabled) {
  234.         counterSetupVisitor.clear();
  235.         documentIterator.acceptOnTruePaths(counterCollectionVisitor);
  236.         termCount = 0;
  237.         for(int aCount : counterSetupVisitor.count)
  238.           termCount += aCount;
  239.       }
  240.       String termString = null;
  241.       // get the term string
  242.       try {
  243.         termString = atomicIndex.getDirectTerm(termId).toString();
  244.       } catch(Exception e) {
  245.         System.err.println("Error reading indirect index term with ID " +
  246.           termId);
  247.         e.printStackTrace();
  248.         termId = documentIterator.nextDocument();
  249.         continue terms;
  250.       }
  251.       if(stopWordsBlocked && stopWords.contains(termString)) {
  252.         // skip this term
  253.         termId = documentIterator.nextDocument();
  254.         continue terms;
  255.       }
  256.       if(indexType == IndexType.ANNOTATIONS) {
  257.         if(!annotationHelper.isMentionUri(termString)) {
  258.           // skip this term (not produced by our helper)
  259.           termId = documentIterator.nextDocument();
  260.           continue terms;
  261.         }
  262.         if(describeAnnotations) {
  263.           termDescriptions.add(annotationHelper.describeMention(termString));
  264.         }
  265.       }
  266.       termStrings.add(termString);
  267.       if(countsEnabled) {
  268.         termCounts.add(termCount);
  269.       }
  270.       termId = documentIterator.nextDocument();
  271.     }
  272.     // construct the result
  273.     TermsResultSet res =
  274.       new TermsResultSet(termStrings.toArray(new String[termStrings.size()]),
  275.         null, countsEnabled ? termCounts.toIntArray() : null,
  276.         describeAnnotations
  277.           ? termDescriptions.toArray(new String[termDescriptions.size()])
  278.           : null);
  279.     if(describeAnnotations) res = TermsResultSet.groupByDescription(res);
  280.     return res;
  281.   }

  282.   /**
  283.    * Should stop words be filtered out from the results? Defaults to
  284.    * <code>false</code>.
  285.    *
  286.    * @return the stopWordsBlocked
  287.    */
  288.   public boolean isStopWordsBlocked() {
  289.     return stopWordsBlocked;
  290.   }

  291.   /**
  292.    * Enables or disables the filtering of stop words from the results. If a
  293.    * custom list of stop words has been set (by calling
  294.    * {@link #setStopWords(String[])}) then it is used, otherwise the
  295.    * {@link #DEFAULT_STOP_WORDS} list is used.
  296.    *
  297.    * @param stopWordsBlocked
  298.    *          the stopWordsBlocked to set
  299.    */
  300.   public void setStopWordsBlocked(boolean stopWordsBlocked) {
  301.     this.stopWordsBlocked = stopWordsBlocked;
  302.   }

  303.   /**
  304.    * Gets the current custom list of stop words.
  305.    *
  306.    * @return the stopWords
  307.    */
  308.   public Set<String> getStopWords() {
  309.     return stopWords;
  310.   }

  311.   public void setStopWords(Set<String> stopWords) {
  312.     this.stopWords = new HashSet<String>(stopWords);
  313.   }

  314.   /**
  315.    * Sets the custom list of stop words that should be blocked from query
  316.    * results. The actual blocking also needs to be enabled by calling
  317.    * {@link #setStopWordsBlocked(boolean)}. If this array is set to
  318.    * <code>null<code>, then the
  319.    * {@link #DEFAULT_STOP_WORDS} are used.
  320.    *
  321.    * @param stopWords
  322.    *          the stopWords to set
  323.    */
  324.   public void setStopWords(String[] stopWords) {
  325.     this.stopWords = new HashSet<String>(stopWords.length);
  326.     for(String sw : stopWords)
  327.       this.stopWords.add(sw);
  328.   }
  329. }