TermQuery.java

  1. /*
  2.  *  TermQuery.java
  3.  *
  4.  *  Copyright (c) 2007-2011, The University of Sheffield.
  5.  *
  6.  *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
  7.  *  and is free software, licenced under the GNU Lesser General Public License,
  8.  *  Version 3, June 2007 (also included with this distribution as file
  9.  *  LICENCE-LGPL3.html).
  10.  *
  11.  *  Valentin Tablan, 03 Mar 2009
  12.  *  
  13.  *  $Id: TermQuery.java 20208 2017-04-19 08:35:28Z domrout $
  14.  */

  15. package gate.mimir.search.query;

  16. import gate.mimir.IndexConfig;
  17. import gate.mimir.index.AtomicIndex;
  18. import gate.mimir.search.IndexReaderPool;
  19. import gate.mimir.search.QueryEngine;
  20. import it.unimi.dsi.fastutil.ints.IntIterator;
  21. import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
  22. import it.unimi.dsi.fastutil.objects.ReferenceSet;
  23. import it.unimi.dsi.lang.MutableString;
  24. import it.unimi.di.big.mg4j.index.Index;
  25. import it.unimi.di.big.mg4j.index.IndexIterator;
  26. import it.unimi.di.big.mg4j.index.IndexIterators;
  27. import it.unimi.di.big.mg4j.index.IndexReader;
  28. import it.unimi.di.big.mg4j.index.payload.Payload;
  29. import it.unimi.di.big.mg4j.search.DocumentIterator;
  30. import it.unimi.di.big.mg4j.search.IntervalIterator;
  31. import it.unimi.di.big.mg4j.search.visitor.DocumentIteratorVisitor;

  32. import java.io.IOException;

  33. import static gate.mimir.search.QueryEngine.IndexType;

  34. /**
  35.  * A {@link QueryNode} for term queries. A term query consists of an index name
  36.  * and a query term.
  37.  */
  38. public class TermQuery implements QueryNode {

  39.   private static final long serialVersionUID = 7302348587893649887L;

  40.   /**
  41.    * The query term
  42.    */
  43.   private String term;
  44.  
  45.   /**
  46.    * The term ID for this query. If not known,
  47.    * {@link DocumentIterator#END_OF_LIST} is used.
  48.    */
  49.   private long termId = DocumentIterator.END_OF_LIST;
  50.  
  51.  
  52.   /**
  53.    * The name of the index to search.
  54.    */
  55.   private String indexName;
  56.  
  57.   /**
  58.    * The type of the index to be searched.
  59.    */
  60.   private IndexType indexType;
  61.  
  62.   /**
  63.    * The length of the matches. Defaults to <code>1</code>.
  64.    */
  65.   private int length;
  66.  
  67.  
  68.   /**
  69.    * A {@link QueryExecutor} for {@link TermQuery} nodes.
  70.    */
  71.   public static class TermQueryExecutor extends AbstractQueryExecutor implements IndexIterator{

  72.    
  73.     /**
  74.      * The {@link TermQuery} node being executed.
  75.      */
  76.     private TermQuery query;
  77.    
  78.     /**
  79.      * A local reference to the {@link IndexReaderPool} from the
  80.      * {@link QueryEngine}.
  81.      */
  82.     private AtomicIndex atomicIndex;
  83.    
  84.     /**
  85.      * The {@link IndexReader} from the {@link #atomicIndex}.
  86.      */
  87.     private IndexReader indexReader;
  88.    
  89.     /**
  90.      * The index iterator used to obtain hits.
  91.      */
  92.     private IndexIterator indexIterator;
  93.    
  94.     /**
  95.      * The positions iterator for the latest document.
  96.      */
  97.     private IntIterator positionsIterator;
  98.    
  99.    
  100.     /**
  101.      * @param node
  102.      * @param invertedIndex
  103.      * @throws IOException if the index files cannot be accessed.
  104.      */
  105.     public TermQueryExecutor(TermQuery node, QueryEngine engine) throws IOException {
  106.       super(engine, node);
  107.       this.query = node;
  108.       atomicIndex = query.getIndex(engine);

  109.       if(atomicIndex == null) throw new IllegalArgumentException(
  110.               "No index provided for field " + node.getIndexName() + "!");
  111.       Index mg4jIndex = atomicIndex.getIndex();
  112.       if(mg4jIndex != null) {
  113.         indexReader = mg4jIndex.getReader();      
  114.         // if we have the term ID, use that
  115.         if(query.termId != DocumentIterator.END_OF_LIST) {
  116.           this.indexIterator = indexReader.documents(query.termId);
  117.           // set the term (used by rankers)
  118.           MutableString mutableString = new MutableString(query.getTerm());
  119.           atomicIndex.getIndex().termProcessor.processTerm(mutableString);
  120.           this.indexIterator.term(mutableString);
  121.         } else {
  122.           //use the term processor for the query term
  123.           MutableString mutableString = new MutableString(query.getTerm());
  124.           atomicIndex.getIndex().termProcessor.processTerm(mutableString);
  125.           this.indexIterator = indexReader.documents(mutableString.toString());        
  126.         }        
  127.       } else {
  128.         // the atomic index is empty: we have exhausted the search already
  129.         latestDocument = -1;
  130.       }

  131.       positionsIterator = null;
  132.     }

  133.     /* (non-Javadoc)
  134.      * @see gate.mimir.search.query.QueryExecutor#nextDocument()
  135.      */
  136.     public long nextDocument(long from) throws IOException {
  137.       if(closed || from+1 >= atomicIndex.getIndex().numberOfDocuments) return latestDocument = -1;
  138.       if(latestDocument == -1){
  139.         //we have exhausted the search already
  140.         return latestDocument;
  141.       }
  142.      
  143.       if (from >= latestDocument){
  144.         //we do need to skip
  145.         latestDocument = indexIterator.skipTo(from + 1);
  146.       }else{
  147.         //from is lower than latest document,
  148.         //so we just return the next document
  149.         latestDocument = indexIterator.nextDocument();
  150.       }
  151.       if(latestDocument == DocumentIterator.END_OF_LIST){
  152.         //no more documents available
  153.         latestDocument = -1;
  154.       } else {
  155.         positionsIterator = IndexIterators.positionIterator(indexIterator);
  156.       }
  157.       return latestDocument;
  158.     }

  159.     /* (non-Javadoc)
  160.      * @see gate.mimir.search.query.QueryExecutor#nextHit(java.util.Map)
  161.      */
  162.     public Binding nextHit() throws IOException{
  163.       if(closed) return null;
  164.       if(positionsIterator == null) positionsIterator =
  165.           IndexIterators.positionIterator(indexIterator);
  166.       if(latestDocument >= 0 && positionsIterator.hasNext()){
  167.         int position = positionsIterator.nextInt();
  168.         return new Binding(query, latestDocument, position, query.length, null);
  169.       }else{
  170.         //no more positions, or no more documents
  171.         return null;
  172.       }
  173.     }

  174.     /* (non-Javadoc)
  175.      * @see gate.mimir.search.query.QueryExecutor#close()
  176.      */
  177.     public void close() throws IOException {
  178.       if(closed) return;
  179.       super.close();
  180.       indexIterator = null;
  181.       if(indexReader != null) indexReader.close();
  182. //      atomicIndex.returnReader(indexReader);
  183.     }

  184.    
  185.     /* (non-Javadoc)
  186.      * @see it.unimi.di.big.mg4j.index.IndexIterator#nextPosition()
  187.      */
  188.     @Override
  189.     public int nextPosition() throws IOException {
  190.       // TODO Auto-generated method stub
  191.       throw new UnsupportedOperationException("Method not implemented!");
  192.     }

  193.     public boolean hasNext() {
  194.       throw new UnsupportedOperationException("Method not implemented!");
  195.     }

  196.     public Integer next() {
  197.       throw new UnsupportedOperationException("Method not implemented!");
  198.     }

  199.     public void remove() {
  200.       throw new UnsupportedOperationException("Method not implemented!");
  201.     }

  202.     public Index index() {
  203.       return indexIterator.index();
  204.     }

  205.     public IntervalIterator intervalIterator() throws IOException {
  206.       return indexIterator.intervalIterator();
  207.     }

  208.     public long frequency() throws IOException {
  209.       return indexIterator.frequency();
  210.     }

  211.     public IntervalIterator intervalIterator(Index index) throws IOException {
  212.       return indexIterator.intervalIterator(index);
  213.     }

  214.     public Payload payload() throws IOException {
  215.       return indexIterator.payload();
  216.     }

  217.     public int count() throws IOException {
  218.       return indexIterator.count();
  219.     }

  220.     public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
  221.       throws IOException {
  222.       return indexIterator.intervalIterators();
  223.     }

  224.     public ReferenceSet<Index> indices() {
  225.       return indexIterator.indices();
  226.     }

  227.     public IndexIterator id(int id) {
  228.       return indexIterator.id(id);
  229.     }

  230.     public long nextDocument() throws IOException {
  231.       return indexIterator.nextDocument();
  232.     }

  233.     public int id() {
  234.       return indexIterator.id();
  235.     }

  236.     public long document() {
  237.       return indexIterator.document();
  238.     }

  239.     public <T> T accept(DocumentIteratorVisitor<T> visitor) throws IOException {
  240.       return indexIterator.accept(visitor);
  241.     }

  242.     public <T> T acceptOnTruePaths(DocumentIteratorVisitor<T> visitor)
  243.       throws IOException {
  244.       return indexIterator.acceptOnTruePaths(visitor);
  245.     }

  246.     public void dispose() throws IOException {
  247.       indexIterator.dispose();
  248.     }

  249.     public long termNumber() {
  250.       return indexIterator.termNumber();
  251.     }

  252.     public String term() {
  253.       return indexIterator.term();
  254.     }

  255.     public IndexIterator term(CharSequence term) {
  256.       return indexIterator.term(term);
  257.     }

  258.     public IndexIterator weight(double weight) {
  259.       return indexIterator.weight(weight);
  260.     }

  261.     public long skipTo(long n) throws IOException {
  262.       return indexIterator.skipTo(n);
  263.     }

  264.     public double weight() {
  265.       return indexIterator.weight();
  266.     }
  267.    
  268.   }
  269.  
  270.   /**
  271.    * @return the term
  272.    */
  273.   public CharSequence getTerm() {
  274.     return term;
  275.   }
  276.  
  277.   /**
  278.    * @return the termId
  279.    */
  280.   public long getTermId() {
  281.     return termId;
  282.   }

  283.   /**
  284.    * @return the indexName
  285.    */
  286.   public String getIndexName() {
  287.     return indexName;
  288.   }
  289.  
  290.   /**
  291.    * Gets the index for this query in a given {@link QueryEngine}.
  292.    * @param engine
  293.    * @return
  294.    */
  295.   public AtomicIndex getIndex(QueryEngine engine) {
  296.     switch(this.indexType){
  297.       case TOKENS:
  298.         return engine.getTokenIndex(indexName);
  299.       case ANNOTATIONS:
  300.         return engine.getAnnotationIndex(indexName);
  301.       default:
  302.         throw new IllegalArgumentException("Indexes of type " +
  303.                 indexType + " are not supported!");
  304.     }
  305.   }
  306.  
  307.  
  308.   /**
  309.    * Creates a new term query, for searching over the document text.
  310.    *
  311.    * @param indexName the name of the index to be searched. This should be one
  312.    * of the annotation feature names used for indexing tokens (see
  313.    * {@link IndexConfig.TokenIndexerConfig}).
  314.    *
  315.    * @param term the term to be searched for.
  316.    *
  317.    * @see IndexConfig.TokenIndexerConfig
  318.    */
  319.   public TermQuery(String indexName, String term) {
  320.     this(IndexType.TOKENS, indexName, term, 1);
  321.   }
  322.  
  323.   /**
  324.    * Creates a new term query, for searching over the document text.
  325.    *
  326.    * @param indexName the name of the index to be searched. This should be one
  327.    * of the annotation feature names used for indexing tokens (see
  328.    * {@link IndexConfig.TokenIndexerConfig}).
  329.    *
  330.    * @param termId the term ID for the term to be searched for.
  331.    *
  332.    * @see IndexConfig.TokenIndexerConfig
  333.    */
  334.   public TermQuery(String indexName, String term, long termId) {
  335.     this(IndexType.TOKENS, indexName, term, termId, 1);
  336.   }
  337.  
  338.   /**
  339.    * Creates a new term query, for searching over semantic annotations.
  340.    *  
  341.    * @param annotationType the type of annotation sought. This should one of the
  342.    * annotation types used when indexing semantic annotations (see
  343.    * {@link IndexConfig.SemanticIndexerConfig}).
  344.    *
  345.    * @param mentionURI the URI of the mention sought.
  346.    *
  347.    * @param length the length of the mention sought.
  348.    */
  349.   public TermQuery(String annotationType, String mentionURI, int length) {
  350.     this(IndexType.ANNOTATIONS, annotationType, mentionURI, length);
  351.   }
  352.  
  353.   /**
  354.    * Creates a new term query, for searching over semantic annotations.
  355.    *  
  356.    * @param annotationType the type of annotation sought. This should one of the
  357.    * annotation types used when indexing semantic annotations (see
  358.    * {@link IndexConfig.SemanticIndexerConfig}).
  359.    *
  360.    * @param mentionTermid the term ID for the mentionURI sought.
  361.    *
  362.    * @param length the length of the mention sought.
  363.    */
  364.   public TermQuery(String annotationType, String term, long mentionTermid, int length) {
  365.     this(IndexType.ANNOTATIONS, annotationType, term, mentionTermid, length);
  366.   }  
  367.  
  368.   /**
  369.    * Creates a new term query. This constructor is part of a low-level API. see
  370.    * the other constructors of this class, which may be more suitable!
  371.    *  
  372.    * @param indexType The type of index to be searched.
  373.    *
  374.    * @param indexName the name of the index to be searched. If the indexType is
  375.    * {@link IndexType#TOKENS}, then the name is interpreted as the feature name
  376.    * for the document tokens, if the indexType is {@link IndexType#ANNOTATIONS},
  377.    * then the name is interpreted as annotation type.
  378.    *
  379.    * @param term the term to be searched for.
  380.    *
  381.    * @param length the length of the hits (useful in the case of annotation
  382.    * indexes, where the length of each mention is stored external to the actual
  383.    * index).
  384.    */
  385.   public TermQuery(IndexType indexType, String indexName, String term, int length) {
  386.     this.indexType = indexType;
  387.     this.indexName = indexName;
  388.     this.term = term;
  389.     this.length = length;
  390.   }
  391.  
  392.   /**
  393.    * Creates a new term query. This constructor is part of a low-level API. see
  394.    * the other constructors of this class, which may be more suitable!
  395.    *  
  396.    * @param indexType The type of index to be searched.
  397.    *
  398.    * @param indexName the name of the index to be searched. If the indexType is
  399.    * {@link IndexType#TOKENS}, then the name is interpreted as the feature name
  400.    * for the document tokens, if the indexType is {@link IndexType#ANNOTATIONS},
  401.    * then the name is interpreted as annotation type.
  402.    *
  403.    * @param length the length of the hits (useful in the case of annotation
  404.    * indexes, where the length of each mention is stored external to the actual
  405.    * index).
  406.    *
  407.    * @param termId the term ID for sought term.
  408.    */
  409.   public TermQuery(IndexType indexType, String indexName, String term, long termId, int length) {
  410.     this.indexType = indexType;
  411.     this.indexName = indexName;
  412.     this.termId = termId;
  413.     this.term = term;
  414.     this.length = length;
  415.   }
  416.  
  417.  
  418.  
  419.   /**
  420.    * Gets a new query executor for this {@link TermQuery}.
  421.    * @param indexes the set of indexes running on.
  422.    * @return an appropriate {@link QueryExecutor} (in this case, an instance of
  423.    * {@link TermQueryExecutor}).
  424.    * @throws IOException if the index files cannot be accessed.
  425.    * @throws IllegalArgumentException if the provided set of indexes does not
  426.    * include an index for this query's {@link #indexName}.
  427.    * @see gate.mimir.search.query.QueryNode#getQueryExecutor(java.util.Map)
  428.    */
  429.   public QueryExecutor getQueryExecutor(QueryEngine engine) throws IOException {
  430.     return new TermQueryExecutor(this, engine);
  431.   }
  432.  
  433.   public String toString() {
  434.     return "TERM(" +
  435.         (indexName == null ? "" : indexName) +
  436.         ":" + term + ")";
  437.   }

  438.   public IndexType getIndexType() {
  439.     return indexType;
  440.   }

  441.   public int getLength() {
  442.     return length;
  443.   }
  444. }