AbstractIndexTermsQuery.java
- /*
- * AbstractIndexTermsQuery.java
- *
- * Copyright (c) 2007-2011, The University of Sheffield.
- *
- * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
- * and is free software, licenced under the GNU Lesser General Public License,
- * Version 3, June 2007 (also included with this distribution as file
- * LICENCE-LGPL3.html).
- *
- * Valentin Tablan, 17 Jul 2012
- *
- * $Id: AbstractIndexTermsQuery.java 17255 2014-01-29 15:29:10Z valyt $
- */
- package gate.mimir.search.terms;
- import gate.mimir.SemanticAnnotationHelper;
- import gate.mimir.index.AtomicIndex;
- import gate.mimir.search.IndexReaderPool;
- import gate.mimir.search.QueryEngine;
- import gate.mimir.search.QueryEngine.IndexType;
- import it.unimi.di.big.mg4j.search.DocumentIterator;
- import it.unimi.di.big.mg4j.search.visitor.CounterCollectionVisitor;
- import it.unimi.di.big.mg4j.search.visitor.CounterSetupVisitor;
- import it.unimi.di.big.mg4j.search.visitor.TermCollectionVisitor;
- import it.unimi.dsi.fastutil.ints.IntArrayList;
- import it.unimi.dsi.fastutil.objects.ObjectArrayList;
- import java.io.IOException;
- import java.util.HashSet;
- import java.util.Set;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- /**
- * Base class for terms queries that use an MG4J direct index for their search.
- */
- public abstract class AbstractIndexTermsQuery extends
- AbstractDocumentsBasedTermsQuery {
- /**
- * Serialization ID.
- */
- private static final long serialVersionUID = 8382919427152317859L;
- private static final Logger logger = LoggerFactory
- .getLogger(AbstractIndexTermsQuery.class);
- /**
- * The name of the subindex in which the terms are sought. Each Mímir index
- * includes multiple sub-indexes (some storing tokens, other storing
- * annotations), identified by a name. For token indexes, the index name is
- * the name of the token feature being indexed; for annotation indexes, the
- * index name is the annotation type.
- */
- protected final String indexName;
- /**
- * The type of index being searched (tokens or annotations).
- */
- protected final IndexType indexType;
- /**
- * The atomic index used for executing the query. This includes both the
- * inverted and the direct index (if configured).
- */
- protected transient AtomicIndex atomicIndex;
- /**
- * The semantic annotation helper for the correct annotation type (as given by
- * {@link #indexName}), if {@link #indexType} is {@link IndexType#ANNOTATIONS}
- * , <code>null</code> otherwise.
- */
- protected transient SemanticAnnotationHelper annotationHelper;
- /**
- * Should stop words be filtered out of the results?
- */
- protected boolean stopWordsBlocked = false;
- /**
- * Stop words set used for filtering out stop words. See
- * {@link #stopWordsBlocked}.
- */
- protected Set<String> stopWords = null;
- /**
- * If set to true, term strings for annotation mentions are replaced with
- * their description (see
- * {@link SemanticAnnotationHelper#describeMention(String)}.
- */
- protected final boolean describeAnnotations;
- /**
- * The query engine used to execute this query.
- */
- protected transient QueryEngine engine;
- protected final boolean countsEnabled;
- /**
- * @return the countsEnabled
- */
- public boolean isCountsEnabled() {
- return countsEnabled;
- }
- /**
- * The default set of stop words.
- */
- public static final String[] DEFAULT_STOP_WORDS = new String[]{",", ".", "?",
- "!", ":", ";", "#", "~", "^", "@", "%", "&", "(", ")", "[", "]", "{", "}",
- "|", "\\", "<", ">", "-", "+", "*", "/", "=", "'", "\"", "'s", "1", "2",
- "3", "4", "5", "6", "7", "8", "9", "0", "a", "about", "above", "above",
- "across", "after", "afterwards", "again", "against", "all", "almost",
- "alone", "along", "already", "also", "although", "always", "am", "among",
- "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow",
- "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at",
- "b", "back", "be", "became", "because", "become", "becomes", "becoming",
- "been", "before", "beforehand", "behind", "being", "below", "beside",
- "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "c",
- "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry",
- "d", "de", "describe", "detail", "do", "done", "down", "due", "during",
- "e", "each", "eg", "eight", "either", "eleven", "else", "elsewhere",
- "empty", "enough", "etc", "even", "ever", "every", "everyone",
- "everything", "everywhere", "except", "f", "few", "fifteen", "fify",
- "fill", "find", "fire", "first", "five", "for", "former", "formerly",
- "forty", "found", "four", "from", "front", "full", "further", "g", "get",
- "give", "go", "h", "had", "has", "hasnt", "have", "he", "hence", "her",
- "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
- "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if",
- "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself",
- "j", "k", "keep", "l", "last", "latter", "latterly", "least", "less",
- "ltd", "m", "made", "many", "may", "me", "meanwhile", "might", "mill",
- "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my",
- "myself", "n", "name", "namely", "neither", "never", "nevertheless",
- "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing",
- "now", "nowhere", "o", "of", "off", "often", "on", "once", "one", "only",
- "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
- "out", "over", "own", "p", "part", "per", "perhaps", "please", "put", "q",
- "r", "rather", "re", "s", "same", "see", "seem", "seemed", "seeming",
- "seems", "serious", "several", "she", "should", "show", "side", "since",
- "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something",
- "sometime", "sometimes", "somewhere", "still", "such", "system", "t",
- "take", "ten", "than", "that", "the", "their", "them", "themselves",
- "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
- "thereupon", "these", "they", "thickv", "thin", "third", "this", "those",
- "though", "three", "through", "throughout", "thru", "thus", "to",
- "together", "too", "top", "toward", "towards", "twelve", "twenty", "two",
- "u", "un", "under", "until", "up", "upon", "us", "v", "very", "via", "w",
- "was", "we", "well", "were", "what", "whatever", "when", "whence",
- "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
- "whereupon", "wherever", "whether", "which", "while", "whither", "who",
- "whoever", "whole", "whom", "whose", "why", "will", "with", "within",
- "without", "would", "x", "y", "yet", "you", "your", "yours", "yourself",
- "yourselves", "z"};
- /**
- * @param indexName
- * The name of the subindex in which the terms are sought. Each Mímir
- * index includes multiple sub-indexes (some storing tokens, other
- * storing annotations), identified by a name. For token indexes, the
- * index name is the name of the token feature being indexed; for
- * annotation indexes, the index name is the annotation type.
- * @param indexType
- * The type of index to be searched (tokens or annotations).
- * @param countsEnabled
- * should term counts be obtained?
- * @param describeAnnotations
- * If the index being interrogated is of type
- * {@link IndexType#ANNOTATIONS} then the indexed term strings are
- * URIs whose format depends on the actual implementation of the
- * index. These strings make little sense outside of the index. If
- * this is set to <code>true</code>, then term descriptions are also
- * included in the results set. See
- * {@link TermsResultSet#termDescriptions} and
- * {@link SemanticAnnotationHelper#describeMention(String)}. Setting
- * this to <code>true</code> has no effect if the index being
- * interrogated is a {@link IndexType#TOKENS} index.
- */
- public AbstractIndexTermsQuery(String indexName, IndexType indexType,
- boolean countsEnabled,
- boolean describeAnnotations,
- long... documentIDs) {
- super(documentIDs);
- this.indexName = indexName;
- this.indexType = indexType;
- this.countsEnabled = countsEnabled;
- this.describeAnnotations =
- describeAnnotations && (indexType == IndexType.ANNOTATIONS);
- }
- /**
- * Populates the internal state by obtaining references to the direct and
- * indirect indexes from the {@link QueryEngine}.
- *
- * @param engine
- * the {@link QueryEngine} used to execute this query.
- * @throws IllegalArgumentException
- * if the index represented by the provided query engine does not
- * have a direct index for the given sub-index (as specified by
- * {@link #indexType} and {@link #indexName}).
- */
- protected void prepare(QueryEngine engine) {
- this.engine = engine;
- switch(indexType){
- case ANNOTATIONS:
- atomicIndex = engine.getAnnotationIndex(indexName);
- annotationHelper = engine.getAnnotationHelper(indexName);
- break;
- case TOKENS:
- atomicIndex = engine.getTokenIndex(indexName);
- break;
- default:
- throw new IllegalArgumentException("Invalid index type: " +
- indexType.toString());
- }
- if(!atomicIndex.hasDirectIndex()) { throw new IllegalArgumentException(
- "This type of query requires a " +
- "direct index, but one was not found for (" +
- indexType.toString().toLowerCase() + ") sub-index \"" + indexName +
- "\""); }
- }
- protected TermsResultSet buildResultSet(DocumentIterator documentIterator)
- throws IOException {
- // prepare local data
- ObjectArrayList<String> termStrings = new ObjectArrayList<String>();
- ObjectArrayList<String> termDescriptions =
- describeAnnotations ? new ObjectArrayList<String>() : null;
- IntArrayList termCounts = countsEnabled ? new IntArrayList() : null;
- TermCollectionVisitor termCollectionVisitor = null;
- CounterSetupVisitor counterSetupVisitor = null;
- CounterCollectionVisitor counterCollectionVisitor = null;
- if(countsEnabled) {
- termCollectionVisitor = new TermCollectionVisitor();
- counterSetupVisitor = new CounterSetupVisitor(termCollectionVisitor);
- counterCollectionVisitor =
- new CounterCollectionVisitor(counterSetupVisitor);
- termCollectionVisitor.prepare();
- documentIterator.accept(termCollectionVisitor);
- counterSetupVisitor.prepare();
- documentIterator.accept(counterSetupVisitor);
- }
- if(stopWordsBlocked) {
- // use the default list if no custom one was set
- if(stopWords == null) setStopWords(DEFAULT_STOP_WORDS);
- }
- long termId = documentIterator.nextDocument();
- terms: while(termId != DocumentIterator.END_OF_LIST && termId != -1) {
- int termCount = -1;
- if(countsEnabled) {
- counterSetupVisitor.clear();
- documentIterator.acceptOnTruePaths(counterCollectionVisitor);
- termCount = 0;
- for(int aCount : counterSetupVisitor.count)
- termCount += aCount;
- }
- String termString = null;
- // get the term string
- try {
- termString = atomicIndex.getDirectTerm(termId).toString();
- } catch(Exception e) {
- System.err.println("Error reading indirect index term with ID " +
- termId);
- e.printStackTrace();
- termId = documentIterator.nextDocument();
- continue terms;
- }
- if(stopWordsBlocked && stopWords.contains(termString)) {
- // skip this term
- termId = documentIterator.nextDocument();
- continue terms;
- }
- if(indexType == IndexType.ANNOTATIONS) {
- if(!annotationHelper.isMentionUri(termString)) {
- // skip this term (not produced by our helper)
- termId = documentIterator.nextDocument();
- continue terms;
- }
- if(describeAnnotations) {
- termDescriptions.add(annotationHelper.describeMention(termString));
- }
- }
- termStrings.add(termString);
- if(countsEnabled) {
- termCounts.add(termCount);
- }
- termId = documentIterator.nextDocument();
- }
- // construct the result
- TermsResultSet res =
- new TermsResultSet(termStrings.toArray(new String[termStrings.size()]),
- null, countsEnabled ? termCounts.toIntArray() : null,
- describeAnnotations
- ? termDescriptions.toArray(new String[termDescriptions.size()])
- : null);
- if(describeAnnotations) res = TermsResultSet.groupByDescription(res);
- return res;
- }
- /**
- * Should stop words be filtered out from the results? Defaults to
- * <code>false</code>.
- *
- * @return the stopWordsBlocked
- */
- public boolean isStopWordsBlocked() {
- return stopWordsBlocked;
- }
- /**
- * Enables or disables the filtering of stop words from the results. If a
- * custom list of stop words has been set (by calling
- * {@link #setStopWords(String[])}) then it is used, otherwise the
- * {@link #DEFAULT_STOP_WORDS} list is used.
- *
- * @param stopWordsBlocked
- * the stopWordsBlocked to set
- */
- public void setStopWordsBlocked(boolean stopWordsBlocked) {
- this.stopWordsBlocked = stopWordsBlocked;
- }
- /**
- * Gets the current custom list of stop words.
- *
- * @return the stopWords
- */
- public Set<String> getStopWords() {
- return stopWords;
- }
- public void setStopWords(Set<String> stopWords) {
- this.stopWords = new HashSet<String>(stopWords);
- }
- /**
- * Sets the custom list of stop words that should be blocked from query
- * results. The actual blocking also needs to be enabled by calling
- * {@link #setStopWordsBlocked(boolean)}. If this array is set to
- * <code>null<code>, then the
- * {@link #DEFAULT_STOP_WORDS} are used.
- *
- * @param stopWords
- * the stopWords to set
- */
- public void setStopWords(String[] stopWords) {
- this.stopWords = new HashSet<String>(stopWords.length);
- for(String sw : stopWords)
- this.stopWords.add(sw);
- }
- }